## Sentiment Analysis: Model Training and Predictions

This notebook walks through training a separate sentiment analysis model for each source in the dataset and then using those models to make predictions.

In [1]:
# Import necessary libraries

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import joblib
import os

In [2]:
# Download necessary NLTK data
# 'punkt' / 'punkt_tab': tokenizer resources, presumably what word_tokenize relies on
#   (which of the two is required likely depends on the installed NLTK version; verify).
# 'stopwords': corpus read by stopwords.words('english') in preprocess_text.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to C:\Users\Vandit
[nltk_data]     sharma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Vandit
[nltk_data]     sharma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Vandit
[nltk_data]     sharma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Load the datasets
# The data directory can be overridden with the SENTIMENT_DATA_DIR environment
# variable; the default is the original hard-coded location, so existing setups
# keep working unchanged.
DATA_DIR = os.environ.get("SENTIMENT_DATA_DIR", "D:/NLP Sentiment Analysis/data")

# Column labels are supplied explicitly via names=, so the first line of each
# CSV is read as data rather than as a header.
COLUMN_NAMES = ['serial_number', "Source", "Sentiment", "Text"]

train_data = pd.read_csv(f"{DATA_DIR}/twitter_training.csv", names=COLUMN_NAMES)
val_data = pd.read_csv(f"{DATA_DIR}/twitter_validation.csv", names=COLUMN_NAMES)

In [4]:
# Sanity check after loading: report (rows, columns) and preview the first rows.
print(f"Training data shape: {train_data.shape}")
display(train_data.head())

Training data shape: (74682, 4)


Unnamed: 0,serial_number,Source,Sentiment,Text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [5]:
# Same sanity check for the validation set: dimensions plus a preview of the first rows.
print(f"validation data shape: {val_data.shape}")
display(val_data.head())

validation data shape: (1000, 4)


Unnamed: 0,serial_number,Source,Sentiment,Text
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


In [6]:
# Confirm the column labels that were assigned via names= when the CSV was read.
train_data.columns

Index(['serial_number', 'Source', 'Sentiment', 'Text'], dtype='object')

In [7]:
# Lazily-built cache of the English stop-word set (filled on first use).
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, building it only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text):
    """Normalize a raw text into a space-separated string of content tokens.

    Steps: lowercase, strip every character that is not an ASCII letter or
    whitespace, tokenize with ``word_tokenize``, and drop English stop words.

    Args:
        text: The raw text. Non-string values are converted with ``str()``.
            Missing values (``None`` or a float NaN) yield an empty string
            instead of being stringified into the spurious token
            ``"none"`` / ``"nan"``.

    Returns:
        str: The remaining tokens joined by single spaces (possibly empty).
    """
    # Missing text must not leak into the features as a literal word.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]','',text)
    tokens = word_tokenize(text)
    # The stop-word set is identical for every call, so it is built once and
    # reused instead of being re-read from the corpus for each row.
    stop_words = _english_stop_words()
    return ' '.join(token for token in tokens if token not in stop_words)

In [8]:
# Derive the cleaned-text column for the training set and show it next to the raw text.
train_data['Processed_Text'] = train_data['Text'].map(preprocess_text)
display(train_data.loc[:, ['Text', 'Processed_Text']])

Unnamed: 0,Text,Processed_Text
0,im getting on borderlands and i will murder yo...,im getting borderlands murder
1,I am coming to the borders and I will kill you...,coming borders kill
2,im getting on borderlands and i will kill you ...,im getting borderlands kill
3,im coming on borderlands and i will murder you...,im coming borderlands murder
4,im getting on borderlands 2 and i will murder ...,im getting borderlands murder
...,...,...
74677,Just realized that the Windows partition of my...,realized windows partition mac like years behi...
74678,Just realized that my Mac window partition is ...,realized mac window partition years behind nvi...
74679,Just realized the windows partition of my Mac ...,realized windows partition mac years behind nv...
74680,Just realized between the windows partition of...,realized windows partition mac like years behi...


In [9]:
# Apply the same cleaning to the validation set and preview raw vs. cleaned text.
val_data['Processed_Text'] = val_data['Text'].map(preprocess_text)
display(val_data.loc[:, ['Text', 'Processed_Text']].head())

Unnamed: 0,Text,Processed_Text
0,I mentioned on Facebook that I was struggling ...,mentioned facebook struggling motivation go ru...
1,BBC News - Amazon boss Jeff Bezos rejects clai...,bbc news amazon boss jeff bezos rejects claims...
2,@Microsoft Why do I pay for WORD when it funct...,microsoft pay word functions poorly samsungus ...
3,"CSGO matchmaking is so full of closet hacking,...",csgo matchmaking full closet hacking truly awf...
4,Now the President is slapping Americans in the...,president slapping americans face really commi...


In [10]:
# Distinct values of the Source column, in order of first appearance.
sources = pd.unique(train_data['Source'])
sources

array(['Borderlands', 'CallOfDutyBlackopsColdWar', 'Amazon', 'Overwatch',
       'Xbox(Xseries)', 'NBA2K', 'Dota2', 'PlayStation5(PS5)',
       'WorldOfCraft', 'CS-GO', 'Google', 'AssassinsCreed', 'ApexLegends',
       'LeagueOfLegends', 'Fortnite', 'Microsoft', 'Hearthstone',
       'Battlefield', 'PlayerUnknownsBattlegrounds(PUBG)', 'Verizon',
       'HomeDepot', 'FIFA', 'RedDeadRedemption(RDR)', 'CallOfDuty',
       'TomClancysRainbowSix', 'Facebook', 'GrandTheftAuto(GTA)',
       'MaddenNFL', 'johnson&johnson', 'Cyberpunk2077',
       'TomClancysGhostRecon', 'Nvidia'], dtype=object)

In [11]:
def train_source_model(source_data, max_features=5000, random_state=42):
    """Fit a TF-IDF vectorizer and a LinearSVC classifier on one source's rows.

    Args:
        source_data: DataFrame providing a 'Processed_Text' column (model
            input) and a 'Sentiment' column (target labels).
        max_features: Vocabulary size cap passed to ``TfidfVectorizer``.
            Defaults to 5000, the value previously hard-coded.
        random_state: Seed passed to ``LinearSVC`` so that repeated training
            runs are reproducible. Pass ``None`` for the previous unseeded
            behavior.

    Returns:
        tuple: ``(model, tfidf_vectorizer)``, the fitted classifier and the
        fitted vectorizer that must be used to transform text at prediction time.
    """
    texts = source_data['Processed_Text']
    labels = source_data['Sentiment']

    # The vectorizer is fitted on this source only, so every source gets its
    # own vocabulary and IDF weights.
    tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
    features = tfidf_vectorizer.fit_transform(texts)

    model = LinearSVC(random_state=random_state)
    model.fit(features, labels)

    return model, tfidf_vectorizer

# Train models for each source
sources = train_data['Source'].unique()

# exist_ok=True makes directory creation idempotent and removes the
# check-then-create gap of os.path.exists() followed by os.makedirs().
os.makedirs('models', exist_ok=True)

for source in sources:
    print(f"Training model for source: {source}")
    source_data = train_data[train_data['Source'] == source]
    model, vectorizer = train_source_model(source_data)

    # Save the model and vectorizer; the file names embed the source name and
    # are the same names predict_sentiment loads from.
    joblib.dump(model, f'models/{source}_model.joblib')
    joblib.dump(vectorizer, f'models/{source}_vectorizer.joblib')

print("Training completed. Models saved in 'models' directory.")

Training model for source: Borderlands
Training model for source: CallOfDutyBlackopsColdWar




Training model for source: Amazon
Training model for source: Overwatch




Training model for source: Xbox(Xseries)
Training model for source: NBA2K




Training model for source: Dota2
Training model for source: PlayStation5(PS5)




Training model for source: WorldOfCraft
Training model for source: CS-GO




Training model for source: Google
Training model for source: AssassinsCreed




Training model for source: ApexLegends
Training model for source: LeagueOfLegends




Training model for source: Fortnite
Training model for source: Microsoft




Training model for source: Hearthstone
Training model for source: Battlefield




Training model for source: PlayerUnknownsBattlegrounds(PUBG)
Training model for source: Verizon




Training model for source: HomeDepot
Training model for source: FIFA




Training model for source: RedDeadRedemption(RDR)
Training model for source: CallOfDuty




Training model for source: TomClancysRainbowSix
Training model for source: Facebook




Training model for source: GrandTheftAuto(GTA)
Training model for source: MaddenNFL




Training model for source: johnson&johnson
Training model for source: Cyberpunk2077
Training model for source: TomClancysGhostRecon




Training model for source: Nvidia
Training completed. Models saved in 'models' directory.




In [12]:
# Cache of loaded artifacts, keyed by source name:
#   source -> ((model_mtime, vectorizer_mtime), model, vectorizer)
_SOURCE_ARTIFACTS = {}


def _load_source_artifacts(source):
    """Return the (model, vectorizer) saved for ``source``, reading from disk only when needed.

    The cached pair is reused as long as both files' modification times are
    unchanged, so retraining (which rewrites the files) is picked up automatically.

    Raises:
        FileNotFoundError: If either artifact file for ``source`` does not exist.
    """
    model_path = f'models/{source}_model.joblib'
    vectorizer_path = f'models/{source}_vectorizer.joblib'
    # getmtime raises FileNotFoundError for a missing file, so an unknown
    # source still fails with the exception type callers already catch.
    stamp = (os.path.getmtime(model_path), os.path.getmtime(vectorizer_path))

    cached = _SOURCE_ARTIFACTS.get(source)
    if cached is None or cached[0] != stamp:
        # NOTE(security): joblib.load unpickles the file; only load artifacts
        # produced by a trusted training run.
        cached = (stamp, joblib.load(model_path), joblib.load(vectorizer_path))
        _SOURCE_ARTIFACTS[source] = cached
    return cached[1], cached[2]


def predict_sentiment(text, source):
    """Predict the sentiment label of ``text`` using the model trained for ``source``.

    Args:
        text: Raw (unprocessed) text; it is cleaned with ``preprocess_text``
            before vectorization.
        source: Source name used to locate ``models/{source}_model.joblib``
            and ``models/{source}_vectorizer.joblib``.

    Returns:
        The predicted label for the single input text.

    Raises:
        FileNotFoundError: If no saved model/vectorizer exists for ``source``.
    """
    # Artifacts are cached per source so repeated calls (e.g. one per
    # validation row) do not re-read both files from disk every time.
    model, vectorizer = _load_source_artifacts(source)
    processed_text = preprocess_text(text)
    vectorized_text = vectorizer.transform([processed_text])
    # predict() returns one label per input row; exactly one row was passed.
    return model.predict(vectorized_text)[0]

# Evaluate on validation set: one prediction per row, each made with the
# model belonging to that row's source.
val_predictions = [
    predict_sentiment(text, source)
    for text, source in zip(val_data['Text'], val_data['Source'])
]

# Print classification report (true labels first, predictions second)
print(classification_report(val_data['Sentiment'], val_predictions))

              precision    recall  f1-score   support

  Irrelevant       0.99      0.98      0.99       172
    Negative       0.98      0.98      0.98       266
     Neutral       0.99      0.99      0.99       285
    Positive       0.98      0.98      0.98       277

    accuracy                           0.99      1000
   macro avg       0.99      0.99      0.99      1000
weighted avg       0.99      0.99      0.99      1000



In [13]:
# Sanity check: the two printed counts should match (one prediction per validation row).
print(len(val_data['Sentiment']), len(val_predictions))


1000 1000


In [14]:
from sklearn.metrics import accuracy_score

In [15]:
# Hand-written (text, source) pairs used as a quick smoke test of the saved models.
sample_texts = [
    ("I love playing Borderlands! Can't wait to kill some skags!", "Borderlands"),
    ("This new graphics card is amazing!", "Nvidia"),
    ("Facebook's new privacy policy is concerning.", "Facebook"),
    ("The latest Windows update broke my computer.", "Microsoft")
]

for text, source in sample_texts:
    # Only the prediction itself can raise FileNotFoundError (missing model
    # files); in that case the error is printed first, then the usual lines.
    try:
        sentiment = predict_sentiment(text, source)
    except FileNotFoundError as e:
        print(e)
        model_found = False
    else:
        model_found = True

    print(f"Text: '{text}'")
    print(f"Source: {source}")
    if model_found:
        print(f"Predicted sentiment: {sentiment}\n")
    else:
        print("Predicted sentiment: Unable to predict (model not found)\n")

Text: 'I love playing Borderlands! Can't wait to kill some skags!'
Source: Borderlands
Predicted sentiment: Positive

Text: 'This new graphics card is amazing!'
Source: Nvidia
Predicted sentiment: Positive

Text: 'Facebook's new privacy policy is concerning.'
Source: Facebook
Predicted sentiment: Neutral

Text: 'The latest Windows update broke my computer.'
Source: Microsoft
Predicted sentiment: Negative

