In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer #TF-IDF(Term Frequency-Inverse Document Frequency)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
print(stopwords.words('english'))
#removing these words as they are not that importent

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [4]:
dataset = pd.read_csv('/content/training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1', header=None)
#(encoding='ISO-8859-1') tells Pandas how to read special characters (like é, ñ, etc.).
#ISO-8859-1 is a common encoding for text that isn't strictly standard UTF-8.

In [5]:
dataset.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [6]:
col_names = ['target','id','date','flag','user','text']
dataset.columns = col_names
dataset.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


Distibution b/w (-)ve and (+)ve ***tweets***

In [7]:
dataset['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
4,800000


In [8]:
#converting 0->0 and 4->1
dataset['target'] = dataset['target'].map({4:1,0:0})
dataset['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
1,800000


***STEMMING***

In [9]:
#Stemming means cutting words down to their root or base form.
#It's a text preprocessing technique.
#It removes prefixes and suffixes to find the "stem" of the word.
#The stem may not always be a real English word — it's just a base form used for analysis.

stremmer = PorterStemmer()

def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content) #replace everything that's not a-z or A-Z with a space.
    stemmed_content = stemmed_content.lower()  #Converts the entire text to lowercase.
    stemmed_content = stemmed_content.split()  #Splits the text into a list of words.
    #If the word is NOT a stopword (like "is", "the", "and", "a"...), stem it.
    stemmed_content = [stremmer.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content) #Combines the list of words back into a single string.
    return stemmed_content

In [10]:
dataset['text'] = dataset['text'].apply(stemming)

In [11]:
dataset.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,upset updat facebook text might cri result sch...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,kenichan dive mani time ball manag save rest g...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,nationwideclass behav mad see


In [12]:
x = dataset['text'].values
y = dataset['target'].values

In [31]:
x_train , x_test , y_train , y_test = train_test_split(x , y , test_size = 0.2 , random_state = 0)


Converting textual data to numerical ***data***

In [32]:
vectorizer = TfidfVectorizer()#This initializes a TF-IDF vectorizer

x_train = vectorizer.fit_transform(x_train)
#Fit: It learns the vocabulary and inverse document frequency (IDF) of the training data.
#Transform: It converts the text data into a sparse matrix of TF-IDF scores for the training set.

x_test = vectorizer.transform(x_test)
#ransforms the x_test data, based on the vocabulary and IDF values learned from the training data.

In [33]:
print(x_train)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 9453992 stored elements and shape (1280000, 461607)>
  Coords	Values
  (0, 382787)	0.5218568045404242
  (0, 297310)	0.6232356327780383
  (0, 408010)	0.3371370780764098
  (0, 234856)	0.18185821646616165
  (0, 426790)	0.2679643799852714
  (0, 315998)	0.2756501589509843
  (0, 438902)	0.2114735969568524
  (1, 171471)	0.19809776536935508
  (1, 420893)	0.25656200686096986
  (1, 78784)	0.21269612474241953
  (1, 445810)	0.44918210875315057
  (1, 40016)	0.25216535956536873
  (1, 81860)	0.6901664688997386
  (1, 266187)	0.3286516093502419
  (2, 146267)	0.14182057590573882
  (2, 445822)	0.15086623694460702
  (2, 454730)	0.22122021129402739
  (2, 406451)	0.17049803826103221
  (2, 280634)	0.23782486392804178
  (2, 392426)	0.3146223146303898
  (2, 143199)	0.34048926092844567
  (2, 125793)	0.22415259519057731
  (2, 318192)	0.24073956626652676
  (2, 127779)	0.3654548377329762
  (2, 234605)	0.2813845455072357
  :	:
  (1279997, 376922)	0.17628

In [34]:
# Training the model
model = LogisticRegression()
model.fit(x_train , y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Testing the model

In [35]:
# Testing the model
y_pred = model.predict(x_test)
print(accuracy_score(y_test , y_pred))

0.77794375


In [36]:
# Function to predict the sentiment
def predict_sentiment(text):
    text = re.sub('[^a-zA-Z]',' ',text) # removing not a-z and A-Z
    text = text.lower()
    text = text.split()
    text = [stremmer.stem(word) for word in text if not word in stopwords.words('english')]
    text = ' '.join(text)
    text = [text]
    text = vectorizer.transform(text)
    sentiment = model.predict(text)
    if sentiment == 0:
        return "Negative"
    else:
        return "Positive"

In [37]:
# Testing the model
print(predict_sentiment("I hate you"))
print(predict_sentiment("I love you"))

Negative
Positive



Saving the ***Model***

In [38]:
import pickle
pickle.dump(model,open('mode.pkl','wb'))

In [39]:
pickle.dump(vectorizer , open('vectorizer.pkl' , 'wb'))