<a href="https://colab.research.google.com/github/saqibkaka/PRODIGY_DS_TASK/blob/main/PRODIGY_DS_TASK_04/Twitter_Sentiment_Analysis_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import numpy as np
import pandas as pd

import spacy

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

import warnings
warnings.filterwarnings('ignore')

In [15]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# EDA

In [16]:
# Read the dataset with name "Emotion_classify_Data.csv" and store it in a variable df
columns = ['id', 'country', 'Label', 'Text']
df = pd.read_csv('/content/drive/MyDrive/new/twitter_training.csv', names= columns)

# Print the shape of dataframe
print(df.shape)

# Print top 5 rows
df.head(5)

(74682, 4)


Unnamed: 0,id,country,Label,Text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       74682 non-null  int64 
 1   country  74682 non-null  object
 2   Label    74682 non-null  object
 3   Text     73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [18]:
# Check the distribution of Emotion
df['Label'].value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
Negative,22542
Positive,20832
Neutral,18318
Irrelevant,12990


In [19]:
# Show sample
for i in range(5):
    print(f'{i+1}:' , df['Text'][i], '->', df['Label'][i])

1: im getting on borderlands and i will murder you all , -> Positive
2: I am coming to the borders and I will kill you all, -> Positive
3: im getting on borderlands and i will kill you all, -> Positive
4: im coming on borderlands and i will murder you all, -> Positive
5: im getting on borderlands 2 and i will murder you me all, -> Positive


# Preprocessing

## Drop nan values

In [20]:
df.dropna(inplace=True)

In [21]:
df.shape

(73996, 4)

# Preprocess Function

In [22]:
# load english language model and create nlp object from it
nlp = spacy.load('en_core_web_sm')

In [23]:
# use this utility function to get the preprocessed text data
def preprocess(text):
    # remove stop words and lemmatize the text
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)

    return " ".join(filtered_tokens)

# Apply preprocess function on dataframe

In [24]:
df['Preprocessed_Text'] = df['Text'].apply(preprocess)

In [25]:
df

Unnamed: 0,id,country,Label,Text,Preprocessed_Text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,m get borderland murder
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,come border kill
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,m get borderland kill
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,m come borderland murder
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,m get borderland 2 murder
...,...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...,realize Windows partition Mac like 6 year Nvid...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...,realize Mac window partition 6 year Nvidia dri...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...,realize window partition Mac 6 year Nvidia dri...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...,realize window partition Mac like 6 year Nvidi...


## Encoding target column


In [26]:
encoder = LabelEncoder()
df['Label_num'] = encoder.fit_transform(df['Label'])

In [27]:
df.head(5)

Unnamed: 0,id,country,Label,Text,Preprocessed_Text,Label_num
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,m get borderland murder,3
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,come border kill,3
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,m get borderland kill,3
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,m come borderland murder,3
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,m get borderland 2 murder,3


## Split data into train and test

In [28]:
X_train, X_test, y_train, y_test = train_test_split(df['Preprocessed_Text'], df['Label_num'],
                                                   test_size=.2, random_state=123, stratify=df['Label_num'])

In [29]:
print("Shape of X_train: ", X_train.shape)
print("Shape of X_test: ", X_test.shape)

Shape of X_train:  (59196,)
Shape of X_test:  (14800,)


## Machine Learning Model

### Naive Bayes

In [30]:
# Create classifier
clf = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('naive_bayes', (MultinomialNB()))
])

In [31]:
# Model training
clf.fit(X_train, y_train)

In [32]:
# Get prediction
y_pred = clf.predict(X_test)

In [33]:
# Print score
print(accuracy_score(y_test, y_pred))

0.7283108108108108


In [34]:
# Print classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.45      0.61      2575
           1       0.66      0.89      0.76      4472
           2       0.83      0.64      0.72      3622
           3       0.70      0.80      0.75      4131

    accuracy                           0.73     14800
   macro avg       0.78      0.70      0.71     14800
weighted avg       0.76      0.73      0.72     14800



## Random Fores

In [35]:
# Create classifier
clr = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('random_forest', (RandomForestClassifier()))
])

In [36]:
# Model training
clr.fit(X_train, y_train)

In [37]:
# Get prediction
y_pred = clr.predict(X_test)

In [38]:
# Print score
print(accuracy_score(y_test, y_pred))

0.9125675675675675


In [39]:
# Print classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.85      0.90      2575
           1       0.93      0.93      0.93      4472
           2       0.94      0.90      0.92      3622
           3       0.85      0.94      0.89      4131

    accuracy                           0.91     14800
   macro avg       0.92      0.91      0.91     14800
weighted avg       0.92      0.91      0.91     14800



# Test Model

## Get text

In [40]:
test_df = pd.read_csv('/content/drive/MyDrive/new/twitter_validation.csv', names=columns)
test_df.head()

Unnamed: 0,id,country,Label,Text
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


In [41]:
test_text = test_df['Text'][10]
print(f"{test_text} ===> {test_df['Label'][10]}")

The professional dota 2 scene is fucking exploding and I completely welcome it.

Get the garbage out. ===> Positive


## Apply preprocess

In [42]:
test_text_processed = [preprocess(test_text)]
test_text_processed

['professional dota 2 scene fucking explode completely welcome \n\n garbage']

## Get Prediction

In [43]:
test_text = clr.predict(test_text_processed)

## Output

In [44]:
classes = ['Irrelevant', 'Natural', 'Negative', 'Positive']

print(f"True Label: {test_df['Label'][10]}")
print(f'Predict Label: {classes[test_text[0]]}')

True Label: Positive
Predict Label: Positive
