<a href="https://colab.research.google.com/github/rep44-22/Sentimental-Analysis/blob/main/logregression_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
# Download only the NLTK resources this notebook uses, instead of the whole
# 'all' collection (which pulls in many unrelated corpora and taggers):
#   punkt / punkt_tab -> word_tokenize (the name depends on the NLTK version)
#   stopwords         -> stopwords.words('english')
#   wordnet / omw-1.4 -> WordNetLemmatizer
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   U

True

In [None]:
import os
import re
from functools import lru_cache
from getpass import getpass

import joblib
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

# Load the labelled comments; later cells read the 'text' and 'label' columns.
dataset = pd.read_csv('toxicity_en.csv')


from sklearn.utils import shuffle

# Shuffle with a fixed seed. Without random_state the row order changes on
# every run, so the (position-based) train/test split below and all reported
# metrics would not be reproducible.
dataset = shuffle(dataset, random_state=42)


# Preview the first rows of the shuffled frame.
print(dataset.head(10))

# Text preprocessing
# Noise removed from every document, tried in this order at each position:
#   1. @mentions
#   2. any character outside [0-9A-Za-z], space and tab
#   3. scheme://... URLs
#   4. a leading retweet marker ("rt", matched case-insensitively and only as a
#      whole word, because this pattern runs before the text is lowercased)
#   5. leftover "http" fragments
# Written as a raw string so the backslash escapes reach the regex engine
# instead of being treated as (invalid) string escapes.
_NOISE_RE = re.compile(
    r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)|^(?i:rt)\b|http.+?"
)


@lru_cache(maxsize=1)
def _english_stop_words():
    """Load the NLTK English stop-word list once and reuse it on later calls."""
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=1)
def _wordnet_lemmatizer():
    """Create the WordNet lemmatizer once and reuse it on later calls."""
    return WordNetLemmatizer()


def preprocess_text(text):
    """Normalise one raw document into a space-separated string of lemmas.

    Steps: drop whitespace-separated words containing 'http', strip mentions /
    punctuation / URLs / a leading retweet marker, lowercase, tokenize, keep
    only alphabetic non-stop-word tokens, and lemmatize them.

    Args:
        text: The raw document. Anything that is not a ``str`` (for example a
            NaN from an empty CSV cell, or ``None``) is treated as empty.

    Returns:
        The cleaned text as a single string; ``''`` when nothing survives.
    """
    # Missing values arrive as float NaN / None and have no .split().
    if not isinstance(text, str):
        return ''

    text = ' '.join(word for word in text.split() if 'http' not in word)
    text = ' '.join(_NOISE_RE.sub('', text).split())
    text = text.lower()
    if not text:
        # Nothing left to tokenize.
        return ''

    stop_words = _english_stop_words()
    lemmatizer = _wordnet_lemmatizer()
    # Tokenization splits the cleaned text into discrete word units; isalpha()
    # then discards tokens that contain digits.
    tokens = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word.isalpha() and word not in stop_words
    ]
    return ' '.join(tokens)

# Overwrite the 'text' column with the cleaned version of every document
dataset['text'] = dataset['text'].apply(preprocess_text)

# Hold out 20% of the rows for testing; the split is seeded and stratified on
# the label so both parts keep the same class proportions
features, labels = dataset['text'], dataset['label']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# TF-IDF vectorizer and logistic regression are combined in one pipeline so
# that, during grid search, every cross-validation fold fits the vectorizer on
# its own training portion only. Fitting TF-IDF on all of X_train beforehand
# would leak the validation folds' vocabulary and IDF statistics into the
# cross-validation scores used to choose C.
lr_classifier = LogisticRegression(random_state=42)
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('clf', lr_classifier),
])

# Define hyperparameter grid: C of the 'clf' pipeline step
param_grid = {'clf__C': [0.001, 0.01, 0.1, 1, 10, 100]}

# Performing grid search: 5-fold CV on the raw training texts, selected on
# precision; refit=True retrains the best pipeline on the whole training split
grid_search = GridSearchCV(text_clf, param_grid, scoring='precision', cv=5, refit=True)
grid_search.fit(X_train, y_train)

# Printing best hyperparameters (reported under the plain 'C' key)
best_params = {'C': grid_search.best_params_['clf__C']}
print("Best Hyperparameters:", best_params)

# Expose the fitted steps under the names the following cells use
best_pipeline = grid_search.best_estimator_
tfidf_vectorizer = best_pipeline.named_steps['tfidf']
best_lr_model = best_pipeline.named_steps['clf']
X_train_tfidf = tfidf_vectorizer.transform(X_train)

# Save best logistic regression model and TF-IDF vectorizer
joblib.dump(best_lr_model, 'best_logistic_regression_model.joblib')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')

# Vectorize the held-out texts with the already-fitted vectorizer and predict
X_test_tfidf = tfidf_vectorizer.transform(X_test)
y_pred = best_lr_model.predict(X_test_tfidf)

# Compute the evaluation metrics on the test split
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Report every metric under its heading
print("\nModel Evaluation:")
for _heading, _metric in (
    ("Accuracy:", accuracy),
    ("Confusion Matrix:\n", conf_matrix),
    ("Classification Report:\n", classification_rep),
):
    print(_heading, _metric)


                                                  text  label
44   THE TRIGGERED LEFTIES ARE ALL PULLING OUT ALL ...      1
83   JR you deserve to be in HELL or Prison !!!! YO...      1
242  I’m with you on the ridiculousness that Chris ...      1
301  Soros you antichrist demon you will have a spe...      1
446         These dudes are retarded. Lets go Brandon!      1
789  My favorite part of that was they intentionall...      0
508  you spin me right round,\n \n \n \n baby right...      0
288                 🔻👽👁🔻\n ANTI ASIAN and LOVING it!!!      1
796  Marvin Gaye- Falling in love again Good song, ...      0
732  This is what it feels like being a gymnastics ...      0
Best Hyperparameters: {'C': 1}

Model Evaluation:
Accuracy: 0.845
Confusion Matrix:
 [[87 13]
 [18 82]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.87      0.85       100
           1       0.86      0.82      0.84       100

    accuracy              

In [None]:
# Mount Google Drive at /content/drive (Colab only).
# NOTE(review): none of the visible cells read from or write to this mount
# point — confirm it is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install anvil-uplink
import anvil.server

Collecting anvil-uplink
  Downloading anvil_uplink-0.4.2-py2.py3-none-any.whl (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.1/90.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting argparse (from anvil-uplink)
  Downloading argparse-1.4.0-py2.py3-none-any.whl (23 kB)
Collecting ws4py (from anvil-uplink)
  Downloading ws4py-0.5.1.tar.gz (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.4/51.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: ws4py
  Building wheel for ws4py (setup.py) ... [?25l[?25hdone
  Created wheel for ws4py: filename=ws4py-0.5.1-py3-none-any.whl size=45228 sha256=d496f2e2682e6eb76a8630f54c34628037709f954bf63cd52fa1942f098156ca
  Stored in directory: /root/.cache/pip/wheels/2e/7c/ad/d9c746276bf024d44296340869fcb169f1e5d80fb147351a57
Successfully built ws4py
Installing collected packages: ws4py, argparse, 

In [None]:
# The uplink key is a credential, so it must not be stored in the notebook.
# It is read from the ANVIL_UPLINK_KEY environment variable, falling back to
# an interactive prompt that does not echo the value.
# NOTE: any key that was previously committed in this cell should be rotated
# in the Anvil app's uplink settings.
uplink_key = os.environ.get("ANVIL_UPLINK_KEY") or getpass("Anvil uplink key: ")
anvil.server.connect(uplink_key)

Connecting to wss://anvil.works/uplink
Anvil websocket open
Connected to "Default Environment" as SERVER


In [None]:
@anvil.server.callable
def predict_tweet(tweet):
    """Classify one tweet with the trained TF-IDF + logistic-regression model.

    Registered as an Anvil server-callable function.

    Args:
        tweet: The raw tweet text. ``None`` is treated as an empty string and
            any other non-string value is converted with ``str()``.

    Returns:
        int: The predicted label (1 for abusive, 0 for non-abusive).
    """
    # The value comes from a remote caller, so normalise it to a string first.
    raw_text = '' if tweet is None else str(tweet)

    # Apply the same preprocessing that was used on the training data
    preprocessed_tweet = preprocess_text(raw_text)

    # Vectorize the preprocessed tweet with the fitted TF-IDF vectorizer
    vectorized_tweet = tfidf_vectorizer.transform([preprocessed_tweet])

    # predict() returns an array holding one label for the single input row
    prediction = best_lr_model.predict(vectorized_tweet)[0]

    # Convert the NumPy scalar to a built-in int: Anvil can only serialize
    # plain Python values as return values of server functions.
    return int(prediction)

In [None]:
# Keep this cell running so the uplink connection stays open and the Anvil app
# can call predict_tweet; interrupt the kernel to stop serving.
anvil.server.wait_forever()