In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
import pickle
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saisn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saisn\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [3]:
train = pd.read_csv("C:/Users/saisn/OneDrive/Desktop/twitter_training.csv")
validation = pd.read_csv("C:/Users/saisn/OneDrive/Desktop/twitter_validation.csv")

In [4]:
train.columns = ['id','bdlands','type','tweet']
train.head()

Unnamed: 0,id,bdlands,type,tweet
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [5]:
validation.columns = ['id','bdlands','type','tweet']
validation.head()

Unnamed: 0,id,bdlands,type,tweet
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,Now the President is slapping Americans in the...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...


In [6]:
train_data=train
train_data

Unnamed: 0,id,bdlands,type,tweet
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...


In [7]:
val_data=validation
val_data

Unnamed: 0,id,bdlands,type,tweet
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,Now the President is slapping Americans in the...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...
...,...,...,...,...
994,4891,GrandTheftAuto(GTA),Irrelevant,⭐️ Toronto is the arts and culture capital of ...
995,4359,CS-GO,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
996,2652,Borderlands,Positive,Today sucked so it’s time to drink wine n play...
997,8069,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.


In [8]:
#Text transformation
train_data["lower"]=train_data.tweet.str.lower() #lowercase
train_data["lower"]=[str(data) for data in train_data.lower] #converting all to string
train_data["lower"]=train_data.lower.apply(lambda x: re.sub('[^A-Za-z0-9 ]+', ' ', x)) #regex
val_data["lower"]=val_data.tweet.str.lower() #lowercase
val_data["lower"]=[str(data) for data in val_data.lower] #converting all to string
val_data["lower"]=val_data.lower.apply(lambda x: re.sub('[^A-Za-z0-9 ]+', ' ', x)) #regex

In [9]:
train_data.head()

Unnamed: 0,id,bdlands,type,tweet,lower
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,i am coming to the borders and i will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...,im getting into borderlands and i can murder y...


In [10]:
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

In [11]:
def preprocess_text(text):
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    
    # Remove stop words and perform stemming
    processed_text = [stemmer.stem(word) for word in tokens if word.lower() not in stop_words]
    
    # Join the processed words back into a single string
    processed_text = ' '.join(processed_text)
    
    return processed_text

In [12]:
train_data['processed_text'] = train_data['lower'].apply(preprocess_text)

In [13]:
X = train_data['processed_text']
y = train_data['type']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [16]:
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_tfidf, y_train)

In [17]:
y_pred = logreg.predict(X_test_tfidf)

In [18]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

  Irrelevant       0.80      0.66      0.73      2661
    Negative       0.79      0.82      0.81      4471
     Neutral       0.77      0.73      0.75      3551
    Positive       0.74      0.82      0.78      4254

    accuracy                           0.77     14937
   macro avg       0.78      0.76      0.76     14937
weighted avg       0.77      0.77      0.77     14937



In [19]:
val_data['processed_text'] = val_data['lower'].apply(preprocess_text)

In [20]:
X_val_tfidf = tfidf_vectorizer.transform(val_data['processed_text'])
y_val_pred = logreg.predict(X_val_tfidf)
print(classification_report(val_data['type'], y_val_pred))

              precision    recall  f1-score   support

  Irrelevant       0.88      0.84      0.86       171
    Negative       0.84      0.91      0.88       266
     Neutral       0.90      0.86      0.88       285
    Positive       0.88      0.89      0.88       277

    accuracy                           0.88       999
   macro avg       0.88      0.87      0.87       999
weighted avg       0.88      0.88      0.88       999



In [21]:
filename = 'logistic_regression_model.pkl'
pickle.dump(logreg, open(filename, 'wb'))
pickle.dump(tfidf_vectorizer, open('tfidf_vectorizer.pkl', 'wb'))