# Sentiment Analysis: Model Training and Prediction

This notebook walks through the process of training sentiment analysis models for different sources and making predictions.

In [2]:
# import necessary libraries
import os
import re
from pathlib import Path

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC

In [3]:
# Fetch the NLTK resources this notebook relies on: the tokenizer data
# ('punkt', 'punkt_tab') and the 'stopwords' corpus.
# quiet=True keeps the per-package progress log, which prints the local
# nltk_data directory (including the user name), out of the saved output.
for resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(resource, quiet=True)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pkkar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pkkar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\pkkar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [5]:
# Directory holding the CSV files. It defaults to the original location but can
# be redirected with the SENTIMENT_DATA_DIR environment variable, so the
# notebook is not tied to one machine's folder layout.
DATA_DIR = Path(os.environ.get("SENTIMENT_DATA_DIR", "C:/sentiment analysis/data"))

# Column names are supplied explicitly via `names=`, so pandas treats the first
# line of each file as data rather than as a header.
COLUMN_NAMES = ['serial_number', 'source', 'sentiment', 'text']

train_data = pd.read_csv(DATA_DIR / "twitter_training.csv", names=COLUMN_NAMES)
val_data = pd.read_csv(DATA_DIR / "twitter_validation.csv", names=COLUMN_NAMES)

In [6]:
# Report the dimensions of both splits, then preview the first training rows.
print("Training data shape:" + str(train_data.shape))
print("Validation data shape: " + str(val_data.shape))
print("\nTraining data sample:")
display(train_data.head())

Training data shape:(74682, 4)
Validation data shape: (1000, 4)

Training data sample:


Unnamed: 0,serial_number,source,sentiment,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [7]:
# Stop-word sets keyed by language. Filled on first use so the NLTK corpus is
# read once instead of on every call to preprocess_text.
_STOP_WORDS_CACHE = {}


def _stop_words_for(language='english'):
    """Return the NLTK stop-word set for `language`, loading it only once."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def preprocess_text(text):
    """Normalise raw text into a space-separated string of content tokens.

    Steps: lower-case the text, delete every character that is not an ASCII
    letter or whitespace, tokenize with ``word_tokenize``, drop English stop
    words, and join the remaining tokens with single spaces.

    Parameters
    ----------
    text : object
        The raw text. Non-string values are converted with ``str``.
        ``None`` and float NaN (pandas' marker for a missing cell) are treated
        as empty text.

    Returns
    -------
    str
        The cleaned text; an empty string when nothing remains or the input
        is missing.
    """
    # Without this guard a missing cell is stringified to "nan" and ends up in
    # the vocabulary as if it were a real word.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = _stop_words_for('english')
    return ' '.join(token for token in tokens if token not in stop_words)
    
    

In [9]:
# Store a cleaned version of every training text next to the raw column.
train_data['processed_text'] = train_data['text'].map(preprocess_text)

In [12]:
# Compare raw and cleaned training text side by side.
display(train_data.loc[:, ['text', 'processed_text']])

Unnamed: 0,text,processed_text
0,im getting on borderlands and i will murder yo...,im getting borderlands murder
1,I am coming to the borders and I will kill you...,coming borders kill
2,im getting on borderlands and i will kill you ...,im getting borderlands kill
3,im coming on borderlands and i will murder you...,im coming borderlands murder
4,im getting on borderlands 2 and i will murder ...,im getting borderlands murder
...,...,...
74677,Just realized that the Windows partition of my...,realized windows partition mac like years behi...
74678,Just realized that my Mac window partition is ...,realized mac window partition years behind nvi...
74679,Just realized the windows partition of my Mac ...,realized windows partition mac years behind nv...
74680,Just realized between the windows partition of...,realized windows partition mac like years behi...


In [13]:
# Apply the same cleaning to the validation texts and preview the first rows.
val_data['processed_text'] = val_data['text'].map(preprocess_text)
display(val_data.loc[:, ['text', 'processed_text']].head(5))

Unnamed: 0,text,processed_text
0,I mentioned on Facebook that I was struggling ...,mentioned facebook struggling motivation go ru...
1,BBC News - Amazon boss Jeff Bezos rejects clai...,bbc news amazon boss jeff bezos rejects claims...
2,@Microsoft Why do I pay for WORD when it funct...,microsoft pay word functions poorly samsungus ...
3,"CSGO matchmaking is so full of closet hacking,...",csgo matchmaking full closet hacking truly awf...
4,Now the President is slapping Americans in the...,president slapping americans face really commi...


In [21]:
import os 

In [22]:
! pip install joblib



In [24]:
import joblib

In [25]:
def train_source_model(source_data, max_features=5000, random_state=42):
    """Fit a TF-IDF + LinearSVC sentiment classifier on one source's rows.

    Parameters
    ----------
    source_data : pandas.DataFrame
        Rows for a single source. Must contain the columns
        ``'processed_text'`` (model input) and ``'sentiment'`` (label).
    max_features : int, default 5000
        Vocabulary size cap passed to ``TfidfVectorizer``.
    random_state : int or None, default 42
        Seed passed to ``LinearSVC`` so that repeated runs of the notebook
        produce the same model. Pass ``None`` for unseeded behaviour.

    Returns
    -------
    tuple
        ``(model, tfidf_vectorizer)``: the fitted classifier and the fitted
        vectorizer needed to transform new text for it.
    """
    texts = source_data['processed_text']
    labels = source_data['sentiment']

    # The vectorizer is fitted per source, so each model has its own vocabulary.
    tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
    features = tfidf_vectorizer.fit_transform(texts)

    model = LinearSVC(random_state=random_state)
    model.fit(features, labels)

    return model, tfidf_vectorizer

# Train one model per distinct source and persist it with its vectorizer.
sources = train_data['source'].unique()

# exist_ok=True creates the directory when needed and is a no-op otherwise,
# which avoids the separate check-then-create step and keeps the cell re-runnable.
os.makedirs('models', exist_ok=True)

for source in sources:
    print(f"training model for source: {source}")
    source_data = train_data[train_data['source'] == source]
    model, vectorizer = train_source_model(source_data)

    # Save both artifacts: predictions need the vectorizer that matches the model.
    joblib.dump(model, f'models/{source}_model.joblib')
    joblib.dump(vectorizer, f'models/{source}_vectorizer.joblib')

print("training completed models saved in 'models' directory ")
    
    

training model for source: Borderlands
training model for source: CallOfDutyBlackopsColdWar
training model for source: Amazon
training model for source: Overwatch
training model for source: Xbox(Xseries)
training model for source: NBA2K
training model for source: Dota2
training model for source: PlayStation5(PS5)
training model for source: WorldOfCraft
training model for source: CS-GO
training model for source: Google
training model for source: AssassinsCreed
training model for source: ApexLegends
training model for source: LeagueOfLegends
training model for source: Fortnite
training model for source: Microsoft
training model for source: Hearthstone
training model for source: Battlefield
training model for source: PlayerUnknownsBattlegrounds(PUBG)
training model for source: Verizon
training model for source: HomeDepot
training model for source: FIFA
training model for source: RedDeadRedemption(RDR)
training model for source: CallOfDuty
training model for source: TomClancysRainbowSix
tr

In [26]:
# Loaded joblib artifacts keyed by file path -> (modification time, object).
# Avoids re-reading the same model and vectorizer from disk for every
# prediction; the stored modification time lets a retrained file replace
# the cached copy.
_ARTIFACT_CACHE = {}


def _load_artifact(path):
    """Load a joblib file, reusing the in-memory copy while the file is unchanged."""
    # getmtime raises FileNotFoundError for a missing file, the same exception
    # type callers already handle for unknown sources.
    modified = os.path.getmtime(path)
    entry = _ARTIFACT_CACHE.get(path)
    if entry is None or entry[0] != modified:
        # NOTE: joblib.load unpickles the file; only load artifacts from a
        # trusted location (here, the ones written by the training cell).
        entry = (modified, joblib.load(path))
        _ARTIFACT_CACHE[path] = entry
    return entry[1]


def predict_sentiment(text, source):
    """Predict the sentiment label of `text` with the model trained for `source`.

    Parameters
    ----------
    text : str
        Raw, unprocessed text; it is cleaned with ``preprocess_text`` first.
    source : str
        Source name used to locate ``models/{source}_model.joblib`` and
        ``models/{source}_vectorizer.joblib``.

    Returns
    -------
    The single label predicted by the model for this text.

    Raises
    ------
    FileNotFoundError
        If no saved model or vectorizer exists for `source`.
    """
    model = _load_artifact(f'models/{source}_model.joblib')
    vectorizer = _load_artifact(f'models/{source}_vectorizer.joblib')

    processed_text = preprocess_text(text)
    # transform expects an iterable of documents, hence the one-element list.
    vectorized_text = vectorizer.transform([processed_text])
    return model.predict(vectorized_text)[0]


# Score every validation row with the model that belongs to its source.
val_predictions = [
    predict_sentiment(raw_text, row_source)
    for raw_text, row_source in zip(val_data['text'], val_data['source'])
]

# Per-class precision / recall / F1 against the true labels.
print(classification_report(val_data['sentiment'], val_predictions))

              precision    recall  f1-score   support

  Irrelevant       0.99      0.98      0.99       172
    Negative       0.98      0.98      0.98       266
     Neutral       0.99      0.99      0.99       285
    Positive       0.98      0.98      0.98       277

    accuracy                           0.99      1000
   macro avg       0.99      0.99      0.99      1000
weighted avg       0.99      0.99      0.99      1000



In [28]:
from sklearn.metrics import accuracy_score

# Fraction of validation rows whose predicted label equals the true label.
print(accuracy_score(y_true=val_data['sentiment'], y_pred=val_predictions))

0.986


In [30]:
# Hand-written (text, source) pairs used as a quick smoke test of the models.
sample_texts = [
    ("I love playing Borderlands! Can't wait to kill some skags!", "Borderlands"),
    ("This new graphics card is amazing!", "Nvidia"),
    ("Facebook's new privacy policy is concerning.", "Facebook"),
    ("The latest Windows update broke my computer.", "Microsoft"),
]

for text, source in sample_texts:
    try:
        sentiment = predict_sentiment(text, source)
    except FileNotFoundError as e:
        # No saved model for this source: report it and move on to the next pair.
        print(e)
        print(f"Text: '{text}'")
        print(f"Source: {source}")
        print(f"Pridicted sentiment : Unable to predict (model not found )\n")
    else:
        print(f"Text:'{text}'")
        print(f"Source: {source}")
        print(f"Pridicted sentiment : {sentiment}\n")
        
    

Text:'I love playing Borderlands! Can't wait to kill some skags!'
Source: Borderlands
Pridicted sentiment : Positive

Text:'This new graphics card is amazing!'
Source: Nvidia
Pridicted sentiment : Positive

Text:'Facebook's new privacy policy is concerning.'
Source: Facebook
Pridicted sentiment : Neutral

Text:'The latest Windows update broke my computer.'
Source: Microsoft
Pridicted sentiment : Negative

