Source : https://youtu.be/sLYSY3Ya6Yk?si=7Ocst2t2U4AWZx-v

**Sentiment Analysis : Model Training and Prediction**

This notebook trains a separate sentiment-analysis model for each source (for example Borderlands, Amazon, or Microsoft), saves the models to disk, and then uses them to predict the sentiment of new text.

In [2]:
# Import the dependencies
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import joblib
import os

In [3]:
# Download the NLTK resources this notebook relies on.
# 'stopwords' provides the list read through stopwords.words('english').
nltk.download('stopwords')
# 'punkt' / 'punkt_tab' are tokenizer resources; presumably needed by
# word_tokenize (which of the two is used depends on the NLTK version — TODO confirm).
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
# Load the training and validation splits.
# DATA_DIR defaults to the original Colab location ('/content') but can be
# overridden with the DATA_DIR environment variable, so the notebook is not
# tied to one machine's absolute path.
DATA_DIR = os.environ.get('DATA_DIR', '/content')
# Both CSV files share one schema; the column names are passed explicitly
# through `names`, so they are declared once and reused.
COLUMN_NAMES = ['Id', 'Source', 'Sentiment', 'Tweet']

train_data = pd.read_csv(os.path.join(DATA_DIR, 'twitter_training.csv'), names=COLUMN_NAMES)
val_data = pd.read_csv(os.path.join(DATA_DIR, 'twitter_validation.csv'), names=COLUMN_NAMES)

In [5]:
# Report the (rows, columns) shape of each split.
print(f"Training Data Shape: {train_data.shape}")
print(f"Validation Data Shape: {val_data.shape}")

Training Data Shape: (74682, 4)
Validation Data Shape: (1000, 4)


In [6]:
# Preview the first rows of the training split (Id, Source, Sentiment, Tweet).
train_data.head()

Unnamed: 0,Id,Source,Sentiment,Tweet
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [7]:
# Preview the first rows of the validation split (same columns as the training split).
val_data.head()

Unnamed: 0,Id,Source,Sentiment,Tweet
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


In [8]:
# Stop-word sets are built once per language and then reused. Rebuilding the
# set on every call is pure overhead when preprocess is applied to a whole column.
_STOP_WORDS_CACHE = {}


def _get_stop_words(language='english'):
  """Return the NLTK stop-word list for `language` as a cached frozenset."""
  if language not in _STOP_WORDS_CACHE:
    _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
  return _STOP_WORDS_CACHE[language]


def preprocess(text):
  """Normalize a raw tweet into a space-separated string of content words.

  Steps: lowercase the text, delete every character that is not an ASCII
  letter or whitespace, tokenize with word_tokenize, drop English stop words,
  and join the remaining tokens with single spaces.

  Args:
    text: The raw tweet. Non-string values are converted with str(). Missing
      values (None or a float NaN) are treated as an empty tweet.

  Returns:
    The cleaned text as a str; '' when the input is missing or nothing
    survives the filtering.
  """
  # A missing value would otherwise go through str() and leak the literal
  # word "nan" (or "none") into the features. `text != text` is only true for NaN.
  if text is None or (isinstance(text, float) and text != text):
    return ''

  text = str(text).lower()
  # Characters are deleted, not replaced by a space, so "knee-deep" becomes "kneedeep".
  text = re.sub(r'[^a-zA-Z\s]', '', text)
  tokens = word_tokenize(text)
  stop_words = _get_stop_words('english')
  filtered_tokens = [token for token in tokens if token not in stop_words]
  return ' '.join(filtered_tokens)

In [9]:
# Sanity check: run the cleaner on one raw tweet and display the cleaned string.
text = "@Microsoft Why do I pay for WORD when it funct.."
preprocess(text)

'microsoft pay word funct'

In [12]:
# Clean every training tweet and keep the result next to the raw text.
train_data['Preprocessed_Text'] = train_data['Tweet'].map(preprocess)

In [13]:
# Show the first ten training rows, now including the Preprocessed_Text column.
train_data.head(10)

Unnamed: 0,Id,Source,Sentiment,Tweet,Preprocessed_Text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,im getting borderlands murder
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,coming borders kill
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,im getting borderlands kill
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,im coming borderlands murder
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,im getting borderlands murder
5,2401,Borderlands,Positive,im getting into borderlands and i can murder y...,im getting borderlands murder
6,2402,Borderlands,Positive,So I spent a few hours making something for fu...,spent hours making something fun dont know hug...
7,2402,Borderlands,Positive,So I spent a couple of hours doing something f...,spent couple hours something fun dont know im ...
8,2402,Borderlands,Positive,So I spent a few hours doing something for fun...,spent hours something fun dont know im huge bo...
9,2402,Borderlands,Positive,So I spent a few hours making something for fu...,spent hours making something fun dont know hug...


In [14]:
# Clean every validation tweet the same way as the training tweets.
val_data['Preprocessed_Text'] = val_data['Tweet'].map(preprocess)

In [16]:
# Show the first ten validation rows, now including the Preprocessed_Text column.
val_data.head(10)

Unnamed: 0,Id,Source,Sentiment,Tweet,Preprocessed_Text
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...,mentioned facebook struggling motivation go ru...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...,bbc news amazon boss jeff bezos rejects claims...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...,microsoft pay word functions poorly samsungus ...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,...",csgo matchmaking full closet hacking truly awf...
4,4433,Google,Neutral,Now the President is slapping Americans in the...,president slapping americans face really commi...
5,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...,hi eahelp ive madeleine mccann cellar past yea...
6,7925,MaddenNFL,Positive,Thank you @EAMaddenNFL!! \n\nNew TE Austin Hoo...,thank eamaddennfl new te austin hooper orange ...
7,11332,TomClancysRainbowSix,Positive,"Rocket League, Sea of Thieves or Rainbow Six: ...",rocket league sea thieves rainbow six siege lo...
8,1107,AssassinsCreed,Positive,my ass still knee-deep in Assassins Creed Odys...,ass still kneedeep assassins creed odyssey way...
9,2069,CallOfDuty,Negative,FIX IT JESUS ! Please FIX IT ! What In the wor...,fix jesus please fix world going playstation a...


In [17]:
def train_src_model(src_data, max_features=5000, random_state=42):
  """Fit a TF-IDF + LinearSVC sentiment classifier on one source's tweets.

  Args:
    src_data: DataFrame with a 'Preprocessed_Text' column (model input) and a
      'Sentiment' column (target labels).
    max_features: Upper bound on the TF-IDF vocabulary size (default 5000,
      the value that was previously hard-coded).
    random_state: Seed handed to LinearSVC so that re-running the notebook
      reproduces the same model. Pass None for unseeded behaviour.

  Returns:
    A (model, tfidf_vectorizer) tuple: the fitted LinearSVC and the fitted
    TfidfVectorizer. The same vectorizer must be used to transform text at
    prediction time.
  """
  texts = src_data['Preprocessed_Text']
  labels = src_data['Sentiment']

  # Learn the vocabulary and IDF weights from this source's tweets only.
  tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
  features = tfidf_vectorizer.fit_transform(texts)

  model = LinearSVC(random_state=random_state)
  model.fit(features, labels)

  return model, tfidf_vectorizer

In [19]:
# Train one model per source and persist it together with its vectorizer.
sources = train_data['Source'].unique()

# exist_ok=True replaces the separate exists()/makedirs() pair and keeps the
# cell safe to re-run when the directory is already there.
os.makedirs('models', exist_ok=True)

for source in sources:
  print("Training Model for Source:", source)
  source_data = train_data[train_data['Source'] == source]
  model, vectorizer = train_src_model(source_data)

  # Save the model and its vectorizer. predict_sentiment rebuilds these exact
  # file names from the source name, so the naming scheme must stay in sync.
  joblib.dump(model, f'models/model_{source}.joblib')
  joblib.dump(vectorizer, f'models/vectorizer_{source}.joblib')

print("All models trained and saved successfully")

Training Model for Source: Borderlands
Training Model for Source: CallOfDutyBlackopsColdWar
Training Model for Source: Amazon
Training Model for Source: Overwatch
Training Model for Source: Xbox(Xseries)
Training Model for Source: NBA2K
Training Model for Source: Dota2
Training Model for Source: PlayStation5(PS5)
Training Model for Source: WorldOfCraft
Training Model for Source: CS-GO
Training Model for Source: Google
Training Model for Source: AssassinsCreed
Training Model for Source: ApexLegends
Training Model for Source: LeagueOfLegends
Training Model for Source: Fortnite
Training Model for Source: Microsoft
Training Model for Source: Hearthstone
Training Model for Source: Battlefield
Training Model for Source: PlayerUnknownsBattlegrounds(PUBG)
Training Model for Source: Verizon
Training Model for Source: HomeDepot
Training Model for Source: FIFA
Training Model for Source: RedDeadRedemption(RDR)
Training Model for Source: CallOfDuty
Training Model for Source: TomClancysRainbowSix
Tr

In [20]:
# In-memory cache of loaded artifacts: path -> (file mtime, loaded object).
# Without it every prediction re-reads two joblib files from disk.
_ARTIFACT_CACHE = {}


def _load_artifact(path):
  """Load a joblib file, reusing the in-memory copy while the file is unchanged."""
  # getmtime raises FileNotFoundError when no artifact exists for the source,
  # the same exception type a direct load of a missing file would raise.
  mtime = os.path.getmtime(path)
  entry = _ARTIFACT_CACHE.get(path)
  # A changed mtime means the model was retrained and re-saved, so reload it.
  if entry is None or entry[0] != mtime:
    # NOTE: joblib files are pickle-based; only load artifacts written by
    # this notebook, never files from an untrusted origin.
    entry = (mtime, joblib.load(path))
    _ARTIFACT_CACHE[path] = entry
  return entry[1]


def predict_sentiment(text,source):
  """Predict the sentiment label of `text` using the model trained for `source`.

  Args:
    text: Raw, unpreprocessed text; it is cleaned with preprocess() here.
    source: Source name used to locate 'models/model_<source>.joblib' and
      'models/vectorizer_<source>.joblib'.

  Returns:
    The single label predicted by the source's model.

  Raises:
    FileNotFoundError: If no saved model/vectorizer exists for `source`.
  """
  model = _load_artifact('models/model_'+source+'.joblib')
  vectorizer = _load_artifact('models/vectorizer_'+source+'.joblib')
  preprocessed_text = preprocess(text)
  # The vectorizer expects an iterable of documents, hence the one-element list.
  text_vectorized = vectorizer.transform([preprocessed_text])
  return model.predict(text_vectorized)[0]

In [22]:
# Predict a label for every validation row from its raw tweet, using the model
# that belongs to that row's source. Order matches the rows of val_data.
val_predictions = [
  predict_sentiment(tweet, tweet_source)
  for tweet, tweet_source in zip(val_data['Tweet'], val_data['Source'])
]

In [24]:
# Per-class precision / recall / F1 of the predictions against the true labels.
# NOTE(review): scores this close to 1.0 are unusually high for four-class
# tweet sentiment — confirm the validation tweets are not also present in the
# training file before trusting these numbers.
print(classification_report(y_true=val_data['Sentiment'], y_pred=val_predictions))

              precision    recall  f1-score   support

  Irrelevant       0.99      0.98      0.99       172
    Negative       0.98      0.98      0.98       266
     Neutral       0.99      0.99      0.99       285
    Positive       0.98      0.98      0.98       277

    accuracy                           0.99      1000
   macro avg       0.99      0.99      0.99      1000
weighted avg       0.99      0.99      0.99      1000



In [26]:
# Import the confusion-matrix helper used by this cell.
from sklearn.metrics import multilabel_confusion_matrix

# Print the result: one 2x2 matrix per sentiment class.
print(multilabel_confusion_matrix(y_true=val_data['Sentiment'], y_pred=val_predictions))

[[[827   1]
  [  3 169]]

 [[730   4]
  [  4 262]]

 [[711   4]
  [  2 283]]

 [[718   5]
  [  5 272]]]


In [27]:
from sklearn.metrics import accuracy_score

# Fraction of validation rows whose predicted label equals the true label.
print(accuracy_score(y_true=val_data['Sentiment'], y_pred=val_predictions))

0.986


In [28]:
# Hand-written (text, source) pairs used to try the saved models on new input.
sample_texts = [
  ("I love playing Borderlands!! Can't wait to kill some skags!", "Borderlands"),
  ("This new graphics card is amazing!!", "Nvidia"),
  ("Facebook's new privacy policy is concerning.", "Facebook"),
  ("The latest Windows update broke my computer", "Microsoft"),
]

for text, source in sample_texts:
  try:
    sentiment = predict_sentiment(text, source)
    print(f"Text:'{text}'", f"Source: {source}", f"Sentiment: {sentiment}", sep="\n")
  except Exception as e:
    # Best-effort demo: report the failure and continue with the next sample.
    print(f"Error processing text: {text}", f"Error: {e}", sep="\n")
  finally:
    # The separator line closes every sample, whether it succeeded or failed.
    print("="*50)

Text:'I love playing Borderlands!! Can't wait to kill some skags!'
Source: Borderlands
Sentiment: Positive
Text:'This new graphics card is amazing!!'
Source: Nvidia
Sentiment: Positive
Text:'Facebook's new privacy policy is concerning.'
Source: Facebook
Sentiment: Neutral
Text:'The latest Windows update broke my computer'
Source: Microsoft
Sentiment: Negative
