In [83]:
# Install Kaggle library
!pip install kaggle



In [84]:
# Configure the path of kaggle.json file
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

cp: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory


Importing Twitter Sentiment Dataset


In [85]:
# API to fetch the dataset frm kaggle
!kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
sentiment140.zip: Skipping, found more recently modified local copy (use --force to force download)


In [86]:
# Extracting the compressed dataset
from zipfile import ZipFile
dataset = '/content/sentiment140.zip'

with ZipFile(dataset, 'r') as file:
  file.extractall()
  print('The dataset is extracted')

The dataset is extracted


Importing the dependencies

In [87]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [88]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [89]:
# Printing the stopwords in english
words = stopwords.words('english')
print(set(words))

{'then', "should've", 'mightn', 'ain', 've', 'over', 'were', 'had', "hasn't", "you've", 'more', 'to', 'should', 'hers', 'been', 'doesn', 'do', 'has', 'these', 'no', 'own', 'their', 'yourselves', 'up', 'doing', 'wasn', 'having', 'didn', 'if', "won't", 'during', "aren't", 'on', 'she', 'm', 'than', "mightn't", 'between', 'ours', 'he', 'aren', 'once', 'under', "hadn't", 'hasn', 'them', 'll', 'an', "haven't", 'we', 'herself', 'himself', 'yours', 'does', "you'll", 'so', 'me', 'o', 'weren', "weren't", 'ma', 'i', 'theirs', "shan't", 'won', 'our', 'a', 'into', 'themselves', 'be', 'other', 'until', 'wouldn', "wouldn't", 'because', 'again', 'above', 'you', 'below', 'but', 'very', 'her', 'where', 'who', 'here', 'or', 'in', 'when', 's', 'about', 'myself', 'ourselves', "wasn't", 'needn', 'any', 'down', "couldn't", 'mustn', 'don', 'through', 'is', 'the', 'most', 'your', 'there', 'all', 'both', 'that', 'further', 'have', "she's", 'itself', 't', 'as', 'against', "didn't", 'hadn', 'for', 'they', 'this',

#Data Processing

In [93]:
# Loading the data from csv file to pandas dataframe
twitter_dataset = pd.read_csv('/content/training.1600000.processed.noemoticon.csv', encoding='latin-1')

In [94]:
# Printing the first five rows of dataframe
twitter_dataset.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [95]:
# Checking the no. of rows and columns
twitter_dataset.shape

(1599999, 6)

In [97]:
# Naming the columns and reading the dataset again
column_name = ['target','ids','date','flag','user','text']

twitter_dataset = pd.read_csv('/content/training.1600000.processed.noemoticon.csv', names=column_name, encoding='latin-1')
twitter_dataset.shape

(1600000, 6)

In [98]:
# Printing the first five rows of dataframe
twitter_dataset.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [99]:
# Data Profiling
twitter_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   target  1600000 non-null  int64 
 1   ids     1600000 non-null  int64 
 2   date    1600000 non-null  object
 3   flag    1600000 non-null  object
 4   user    1600000 non-null  object
 5   text    1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [101]:
# Checking present any null values in the dataet
twitter_dataset.isnull().sum()

Unnamed: 0,0
target,0
ids,0
date,0
flag,0
user,0
text,0


In [102]:
# Checking present any duplicate values in the dataset
twitter_dataset.duplicated().sum()

0

In [103]:
# Checking the distribustion in the target column
twitter_dataset['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
4,800000


Converting the target 4 to 1

In [104]:
# Replacing the value 4 to 1 in the target column
twitter_dataset.replace({'target':{4:1}}, inplace=True)

In [105]:
# Checking the distribustion in the target column
twitter_dataset['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
1,800000




*   0 ----> Negative Tweet
*   1 ----> Positive Tweet



# Stemming:

Stemming is the process of reducing the word to its root word

Example : actor, acting, actress = act (root word)

In [108]:
# Calling porter stemmer and wordnet lemmatizer
port_stem = PorterStemmer()
wnl = WordNetLemmatizer()

In [110]:
# Finding the root word for each word present in the text using PorterStemmer.
def port_stemmer(content):
  stemmed_content = re.sub('[^a-zA-Z]',' ',content)
  stemmed_content = stemmed_content.lower()
  stemmed_content = stemmed_content.split()
  stemmed_content = [port_stem.stem(word) for word in stemmed_content if word not in words]
  stemmed_content = set(stemmed_content)
  stemmed_content = ' '.join(stemmed_content)

  return stemmed_content

In [111]:
# Finding the root word for each word present in the text using WordNetLemmatizer.
def wnl_stemmer(content):
  stemmed_content = re.sub('[^a-zA-Z]',' ',content)
  stemmed_content = stemmed_content.lower()
  stemmed_content = stemmed_content.split()
  stemmed_content = [wnl.lemmatize(word, pos='v') for word in stemmed_content if word not in words]
  stemmed_content = ' '.join(stemmed_content)

  return stemmed_content

In [113]:
# Applying the port_stemmer function to the text column
twitter_dataset['port_stemmed_text'] = twitter_dataset['text'].apply(port_stemmer)

In [114]:
# Applying the port_stemmer function to the text column
twitter_dataset['wnl_stemmed_text'] = twitter_dataset['text'].apply(wnl_stemmer)

In [116]:
twitter_dataset.head()

Unnamed: 0,target,ids,date,flag,user,text,port_stemmed_text,wnl_stemmed_text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",zl twitpic http switchfoot david carr com thir...,switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,today might upset facebook text blah school up...,upset update facebook texting might cry result...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,bound ball rest time dive save manag go kenich...,kenichan dive many time ball manage save rest ...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,bodi like fire whole feel itchi,whole body feel itchy like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behav mad see,nationwideclass behave mad see


# 1) Using PorterStemmer

In [117]:
# Selecting x and y dataset
x = twitter_dataset['port_stemmed_text']
y = twitter_dataset['target']

print(x.shape, y.shape)

(1600000,) (1600000,)


In [118]:
# Splitting and x & y - train & test datasets.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=2, test_size=0.2, stratify=y)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(1280000,) (320000,) (1280000,) (320000,)


In [119]:
print(x_train)

1570269                          iv lil wine drink saw watch
1273074                                         hatermagazin
88479      think wipe time gonna though drink favourit co...
254604                   today think hurt sun hand burnt got
667941     infect today find poor mazi babi took dr shot ...
                                 ...                        
941805                                       cheer threewink
1007131    smith night play live sure livewirerrrfm livew...
1460311                               monday eager afternoon
929226     everyon guy hope hear great mother store wait ...
526253                      folger voic wake deeper bad love
Name: port_stemmed_text, Length: 1280000, dtype: object


In [120]:
print(x_test)

131348     mmangen free chat back fine time twitter hubbi...
1142114         show geoffrey ah amp may sanhueza ruth w kim
244564                   dammit mayb ishatara bay area thang
445353                                 game lost stinkyy end
415893                                          cool brother
                                 ...                        
178459     drive wont twitter profil nut keep tri guess l...
1515130                                    teamqivana welcom
1449952    wonder member trip hooray nevertheless destini...
441063                                             feel well
1583304                                    supersandro thank
Name: port_stemmed_text, Length: 320000, dtype: object


Converting the text / sentence to vector form

In [121]:
vectorizer = TfidfVectorizer()

x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [122]:
print(x_train)

  (0, 185193)	0.5277679060576009
  (0, 235045)	0.41996827700291095
  (0, 443066)	0.4484755317023172
  (0, 109306)	0.3753708587402299
  (0, 354543)	0.3588091611460021
  (0, 436713)	0.27259876264838384
  (1, 160636)	1.0
  (2, 109306)	0.2625285236275761
  (2, 406399)	0.18358255316266456
  (2, 443430)	0.3829532339566025
  (2, 409143)	0.17347925398894998
  (2, 150715)	0.21504497702863656
  (2, 407301)	0.21396411814311606
  (2, 129411)	0.3324988719540792
  (2, 77929)	0.35777163813075397
  (2, 266729)	0.2758785793426386
  (2, 433560)	0.3770059680208907
  (2, 124484)	0.2163911233630537
  (2, 132311)	0.2320376578284552
  (2, 288470)	0.19197925311515698
  (2, 178061)	0.18515356216756648
  (3, 406399)	0.2902999123866228
  (3, 411528)	0.2708977244408787
  (3, 172421)	0.3746414692215438
  (3, 388626)	0.39407763314588456
  :	:
  (1279996, 318303)	0.21254698865277744
  (1279996, 434014)	0.27189450523324465
  (1279996, 390130)	0.2206474219107611
  (1279996, 373144)	0.35212500999832036
  (1279996, 2380

In [123]:
print(x_test)

  (0, 15110)	0.17868198150107975
  (0, 31168)	0.16885280831461277
  (0, 67828)	0.2785201532279686
  (0, 106069)	0.3798987676652852
  (0, 132364)	0.26527102786001805
  (0, 138164)	0.24617814953806813
  (0, 171378)	0.2915915735607113
  (0, 271016)	0.4713640654465935
  (0, 279082)	0.18524635737872527
  (0, 388348)	0.2284776498254667
  (0, 398906)	0.36280315655337814
  (0, 409143)	0.16331897218866984
  (0, 420984)	0.18618629168609485
  (1, 6463)	0.30733520460524466
  (1, 15110)	0.211037449588008
  (1, 145393)	0.575262969264869
  (1, 217562)	0.40288153995289894
  (1, 256777)	0.28751585696559306
  (1, 348135)	0.4739279595416274
  (1, 366203)	0.24595562404108307
  (2, 22532)	0.3532582957477176
  (2, 34401)	0.37916255084357414
  (2, 89448)	0.36340369428387626
  (2, 183312)	0.5892069252021465
  (2, 256834)	0.2564939661498776
  :	:
  (319994, 443794)	0.2782185641032538
  (319995, 107868)	0.33399349737546963
  (319995, 109379)	0.3020896484890833
  (319995, 155493)	0.2770682832971669
  (319995, 21

In [124]:
# Calling logistic regression model
port_log_model = LogisticRegression(max_iter=1000)
port_log_model.fit(x_train,y_train)

In [125]:
# calculating train accuracy score
y_train_pred = port_log_model.predict(x_train)
train_acc_score = accuracy_score(y_train, y_train_pred)
train_acc_score

0.7929875

In [126]:
# Calculating test accuracy score
y_test_pred = port_log_model.predict(x_test)
test_acc_score = accuracy_score(y_test, y_test_pred)
test_acc_score

0.7766

Compare both train and test accuracy score, there isn't much difference between the two. Suggest that no overfitting and the model is generalzing well.

# 2) Using WordNetLemmatizer



In [133]:
# Selecting x and y dataset
x = twitter_dataset['wnl_stemmed_text']
y = twitter_dataset['target']

print(x.shape, y.shape)

(1600000,) (1600000,)


In [134]:
# Splitting and x & y - train & test datasets.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=2, test_size=0.2, stratify=y)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(1280000,) (320000,) (1280000,) (320000,)


In [135]:
print(x_train)

1570269                          watch saw iv drink lil wine
1273074                                        hatermagazine
88479      even though favourite drink think vodka coke w...
254604                    think hand get burn sun today hurt
667941     take mazie dr shots today come find ear infect...
                                 ...                        
941805                                      threewinks cheer
1007131    vote livewire play live smiths tomorrow night ...
1460311                               eager monday afternoon
929226     hope everyone mother great day wait hear guy s...
526253                    love wake folgers bad voice deeper
Name: wnl_stemmed_text, Length: 1280000, dtype: object


In [136]:
print(x_test)

131348     mmangen fine much time chat twitter hubby back...
1142114        ahs may show w ruth kim amp geoffrey sanhueza
244564                  ishatara maybe bay area thang dammit
445353                                 game end lose stinkyy
415893                                          cool brother
                                 ...                        
178459     twitter drive nut wont let download profile pi...
1515130                                   teamqivana welcome
1449952    destini nevertheless hooray members wonderful ...
441063                                             feel well
1583304                                    supersandro thank
Name: wnl_stemmed_text, Length: 320000, dtype: object


Converting the text / sentence to vector form

In [137]:
vectorizer = TfidfVectorizer()

x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [138]:
print(x_train)

  (0, 470745)	0.27381755687052417
  (0, 382072)	0.3603943177750534
  (0, 200994)	0.531602880967786
  (0, 118338)	0.3604928603616117
  (0, 253171)	0.42198267430355246
  (0, 477619)	0.45223199624483285
  (1, 174229)	1.0
  (2, 118338)	0.44575020219117023
  (2, 134738)	0.19229950123123435
  (2, 438613)	0.18998814465528252
  (2, 140319)	0.29861409377626263
  (2, 437668)	0.31179953808534727
  (2, 467332)	0.33566451083368337
  (2, 83890)	0.31779754592825116
  (2, 478000)	0.34003077384894803
  (2, 286956)	0.24504330237165792
  (2, 440597)	0.15410651656732774
  (2, 192900)	0.16443321733386596
  (2, 163442)	0.1909460532645921
  (2, 143521)	0.19027327281229714
  (2, 310126)	0.17046503677608277
  (3, 437668)	0.2930971933158944
  (3, 172101)	0.471326476729321
  (3, 158398)	0.2254794387395701
  (3, 60766)	0.47888730806838997
  :	:
  (1279996, 342416)	0.20414960074355934
  (1279996, 467839)	0.2611520470372954
  (1279996, 420378)	0.21327156411443027
  (1279996, 401981)	0.4271013996668542
  (1279996, 2

In [139]:
print(x_test)

  (0, 16250)	0.16996644434178768
  (0, 33607)	0.16057411475401528
  (0, 73066)	0.2646807931966187
  (0, 114865)	0.390655592067218
  (0, 143577)	0.2528275220196835
  (0, 149923)	0.23403390667127777
  (0, 185797)	0.27872069766129115
  (0, 291501)	0.4482512180719088
  (0, 300087)	0.1761655250647252
  (0, 418433)	0.2172860777508959
  (0, 429849)	0.3450134805912801
  (0, 440597)	0.3107653694754491
  (0, 453520)	0.17746743909064872
  (1, 7676)	0.5371044036829311
  (1, 16250)	0.18679764879462396
  (1, 157740)	0.509050504997605
  (1, 234619)	0.35825969336096525
  (1, 276381)	0.25452928565152644
  (1, 375299)	0.42087828098828034
  (1, 394544)	0.21730022724762102
  (2, 24218)	0.3553932768768115
  (2, 37141)	0.377709328107985
  (2, 96922)	0.362010870784943
  (2, 199025)	0.5869486618326093
  (2, 276448)	0.2561287058785863
  :	:
  (319994, 478404)	0.27653748913421006
  (319995, 116815)	0.33246564383959487
  (319995, 118419)	0.2958854697427526
  (319995, 168622)	0.27541253435850743
  (319995, 230167

In [140]:
# Calling logistic regression model
wnl_log_model = LogisticRegression(max_iter=1000)
wnl_log_model.fit(x_train,y_train)

In [142]:
# calculating train accuracy score
y_train_pred = wnl_log_model.predict(x_train)
train_acc_score = accuracy_score(y_train, y_train_pred)
train_acc_score

0.80063828125

In [143]:
# Calculating test accuracy score
y_test_pred = wnl_log_model.predict(x_test)
test_acc_score = accuracy_score(y_test, y_test_pred)
test_acc_score

0.77638125

Compare both train and test accuracy score, there isn't much difference between the two. Suggest that no overfitting and the model is generalzing well.

# After comparing two stemming types both produces the approx same accuarcy score.

Store the trained porter and wordnet logistic model in to the file. Instead of running the whole model again and again, dump or store it in a file and that file can be used directly for the unseen data.

Use pickle to do this.

In [127]:
# Import pickle for loading or dumping the model.
import pickle

In [128]:
# Dumping the model in the file name
file_name = 'Twitter_NLP_Port_Log.sav'
pickle.dump(port_log_model, open(file_name, 'wb'))

In [130]:
# Loading the trained model directly
loaded_port_log_model = pickle.load(open('/content/Twitter_NLP_Port_Log.sav', 'rb'))
loaded_port_log_model

In [131]:
# Checking the model performence on unseen data
x_new = x_test[100]
print(y_test[100])

prediction = loaded_port_log_model.predict(x_new)
print(prediction)

if prediction[0] == 0:
  print('Negative Tweet')
else:
  print('Positive_Tweet')

0
[0]
Negative Tweet


In [132]:
x_new = x_test[1]
print(y_test[1])

prediction = loaded_port_log_model.predict(x_new)
print(prediction)

if prediction[0] == 0:
  print('Negative Tweet')
else:
  print('Positive_Tweet')

0
[1]
Positive_Tweet
