In [4]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
import pickle

In [3]:
# Loading the dataset

col_names = ['target','id','date','flag','user','text']
df = pd.read_csv('/content/drive/MyDrive/training.1600000.processed.noemoticon.csv',encoding='ISO-8859-1',names=col_names)
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [5]:
df.shape

(1600000, 6)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   target  1600000 non-null  int64 
 1   id      1600000 non-null  int64 
 2   date    1600000 non-null  object
 3   flag    1600000 non-null  object
 4   user    1600000 non-null  object
 5   text    1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [7]:
# Checking null values

df.isna().sum()

Unnamed: 0,0
target,0
id,0
date,0
flag,0
user,0
text,0


In [8]:
# Checking for duplicates

df.duplicated().sum()

0

In [9]:
# Checking the distribution of 'target' column

df['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
4,800000


In [10]:
# Coverting label for 'positive(4)' tweet to 1

df['target'] = df['target'].apply(lambda x: 1 if x==4 else 0)

In [11]:
df['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
1,800000


0 - Negative Tweet

1 - Positive Tweet

In [14]:
# Downloading 'stopwords'

import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
# Printing 'stopwords' in English

stopword = stopwords.words('english')
print(stopword)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# **Stemming**
It is a process of reducing a word to its Root word

Example - Running ---> Run


In [16]:
port_stem = PorterStemmer()

In [17]:
def stemming(content):
  stemmed_content = re.sub('[^a-zA-Z]',' ',content)
  stemmed_content = stemmed_content.lower()
  stemmed_content = stemmed_content.split()
  stemmed_content = [port_stem.stem(word) for word in stemmed_content if word not in stopword]
  stemmed_content = ' '.join(stemmed_content)
  return stemmed_content

In [18]:
df['stemmed_content'] = df['text'].apply(stemming)

In [19]:
df.head()

Unnamed: 0,target,id,date,flag,user,text,stemmed_content
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save rest g...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behav mad see


In [20]:
# Splitting the data into 'train' & 'test'

x_train, x_test, y_train, y_test = train_test_split(df['stemmed_content'],df['target'],test_size=0.2,stratify=df['target'],random_state=2)
x_train.shape, x_test.shape

((1280000,), (320000,))

In [21]:
print(x_train)

1570269                          watch saw iv drink lil wine
1273074                                         hatermagazin
88479      even though favourit drink think vodka coke wi...
254604                   think hand got burnt sun today hurt
667941     took mazi dr shot today come find ear infect p...
                                 ...                        
941805                                       threewink cheer
1007131    vote livewir play live smith tomorrow night su...
1460311                               eager monday afternoon
929226     hope everyon mother great day wait hear guy st...
526253                      love wake folger bad voic deeper
Name: stemmed_content, Length: 1280000, dtype: object


In [22]:
print(x_test)

131348     mmangen fine much time chat twitter hubbi back...
1142114         ah may show w ruth kim amp geoffrey sanhueza
244564                   ishatara mayb bay area thang dammit
445353                                 game end lost stinkyy
415893                                          cool brother
                                 ...                        
178459     twitter drive nut wont let download profil pic...
1515130                                    teamqivana welcom
1449952    destini nevertheless hooray member wonder safe...
441063                                             feel well
1583304                                    supersandro thank
Name: stemmed_content, Length: 320000, dtype: object


In [23]:
# Converting textual data to numerical data

vectorizer = TfidfVectorizer()

x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [24]:
print(x_train)

  (0, 443066)	0.4484755317023172
  (0, 235045)	0.41996827700291095
  (0, 109306)	0.3753708587402299
  (0, 185193)	0.5277679060576009
  (0, 354543)	0.3588091611460021
  (0, 436713)	0.27259876264838384
  (1, 160636)	1.0
  (2, 288470)	0.16786949597862733
  (2, 132311)	0.2028971570399794
  (2, 150715)	0.18803850583207948
  (2, 178061)	0.1619010109445149
  (2, 409143)	0.15169282335109835
  (2, 266729)	0.24123230668976975
  (2, 443430)	0.3348599670252845
  (2, 77929)	0.31284080750346344
  (2, 433560)	0.3296595898028565
  (2, 406399)	0.32105459490875526
  (2, 129411)	0.29074192727957143
  (2, 407301)	0.18709338684973031
  (2, 124484)	0.1892155960801415
  (2, 109306)	0.4591176413728317
  (3, 172421)	0.37464146922154384
  (3, 411528)	0.27089772444087873
  (3, 388626)	0.3940776331458846
  (3, 56476)	0.5200465453608686
  :	:
  (1279996, 390130)	0.22064742191076112
  (1279996, 434014)	0.2718945052332447
  (1279996, 318303)	0.21254698865277746
  (1279996, 237899)	0.2236567560099234
  (1279996, 2910

In [25]:
print(x_test)

  (0, 420984)	0.17915624523539803
  (0, 409143)	0.31430470598079707
  (0, 398906)	0.3491043873264267
  (0, 388348)	0.21985076072061738
  (0, 279082)	0.1782518010910344
  (0, 271016)	0.4535662391658828
  (0, 171378)	0.2805816206356073
  (0, 138164)	0.23688292264071403
  (0, 132364)	0.25525488955578596
  (0, 106069)	0.3655545001090455
  (0, 67828)	0.26800375270827315
  (0, 31168)	0.16247724180521766
  (0, 15110)	0.1719352837797837
  (1, 366203)	0.24595562404108307
  (1, 348135)	0.4739279595416274
  (1, 256777)	0.28751585696559306
  (1, 217562)	0.40288153995289894
  (1, 145393)	0.575262969264869
  (1, 15110)	0.211037449588008
  (1, 6463)	0.30733520460524466
  (2, 400621)	0.4317732461913093
  (2, 256834)	0.2564939661498776
  (2, 183312)	0.5892069252021465
  (2, 89448)	0.36340369428387626
  (2, 34401)	0.37916255084357414
  :	:
  (319994, 123278)	0.4530341382559843
  (319995, 444934)	0.3211092817599261
  (319995, 420984)	0.22631428606830145
  (319995, 416257)	0.23816465111736276
  (319995, 3

# **Model** **Training**

In [26]:
lr = LogisticRegression()

lr.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
# Prediction on 'test' data

y_pred = lr.predict(x_test)

In [28]:
# Checking accuracy score

accuracy_score(y_test,y_pred)

0.7781375

In [29]:
p = precision_score(y_pred, y_test,average='weighted')
r = recall_score(y_pred, y_test,average='weighted')
f = f1_score(y_pred, y_test,average='weighted')

p,r,f

(0.778845790703125, 0.7781375, 0.7782160994086762)

# **Pickling the Model**

In [30]:
import pickle

pickle.dump(lr,open('logistic_model.pkl','wb'))
pickled_model = pickle.load(open('logistic_model.pkl','rb'))

In [43]:
# Testing with Pickled Model

new_data = x_test[200]

print('Actual value: ',y_test.iloc[200])

prediction = pickled_model.predict(new_data)
print('Predicted value: ',prediction[0])

if prediction[0] == 0:
  print('Negative Tweet')
else:
  print('Positive Tweet')

Actual value:  1
Predicted value:  1
Positive Tweet
