In [1]:
# Importing Dependencies
import numpy as np
import pandas as pd
import re 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Download the NLTK stopword corpus; the stemming step below reads it via
# stopwords.words('english').
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# printing the stopwords
print(stopwords.words("english"))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

****Data Processing****


In [4]:

# Load the Sentiment140 CSV into a pandas DataFrame.
# NOTE: no `names`/`header` argument is passed here, so pandas uses the first
# record of the file as the column header (the shape shown below is 1599999
# rows, and the first tweet appears as the column labels). A later cell
# re-reads the file with explicit column names.
twitter_data = pd.read_csv('/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv', encoding = 'ISO-8859-1')

In [5]:
# checking the number of rows and columns
twitter_data.shape

(1599999, 6)

In [6]:
# printing the first 5 rows of the dataframe
twitter_data.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [7]:
# Re-read the dataset with explicit column names. Supplying `names` makes
# pandas treat every line of the file as data, so the first tweet is no longer
# consumed as the header row.
column_names = ['target','id','date','flag','user','text']
twitter_data = pd.read_csv('/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv', names=column_names, encoding='ISO-8859-1')
# Only the last expression of a cell is displayed, so a bare `twitter_data.shape`
# in the middle of the cell is evaluated and discarded. Print it explicitly so
# the row/column count is actually visible next to the preview.
print(twitter_data.shape)
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [8]:
# check if missing values exist in dataset
twitter_data.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [9]:
# check the distribution on target column
twitter_data['target'].value_counts()

target
0    800000
4    800000
Name: count, dtype: int64

In [10]:
# Map the label value 4 to 1 so the target column holds 0/1 only.
twitter_data['target'] = twitter_data['target'].replace({4: 1})

In [11]:
twitter_data['target'].value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

****Stemming****

Stemming is the process of reducing a word to its root form (its stem); for example, "update" becomes "updat".

In [12]:
port_stem = PorterStemmer()

In [13]:
_STOPWORD_SET = None  # lazily-built cache of the English stopwords (filled on first call)


def stemming(content):
    """Normalise one piece of text into a space-joined string of stemmed words.

    The text is processed in this order:
      1. every character that is not an ASCII letter is replaced by a space;
      2. the result is lowercased and split on whitespace;
      3. words found in NLTK's English stopword list are dropped;
      4. the remaining words are stemmed with the notebook-level ``port_stem``;
      5. the stems are joined back together with single spaces.

    Args:
        content: The raw text to clean (a ``str``).

    Returns:
        The cleaned, stemmed text as a single string. It is the empty string
        when no word survives the filtering.
    """
    global _STOPWORD_SET
    if _STOPWORD_SET is None:
        # The stopword list used to be requested once per *word* and searched
        # linearly as a list. Build it a single time and keep it as a frozenset
        # so every membership test is a constant-time hash lookup.
        _STOPWORD_SET = frozenset(stopwords.words('english'))

    letters_only = re.sub('[^a-zA-Z]', ' ', content)  # strip everything that is not a letter
    words = letters_only.lower().split()

    # Stopwords are skipped entirely (not stemmed, not kept).
    stems = [port_stem.stem(word) for word in words if word not in _STOPWORD_SET]
    return ' '.join(stems)

In [14]:
twitter_data['stemmed_content'] = twitter_data['text'].apply(stemming)

In [15]:
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text,stemmed_content
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save rest g...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behav mad see


In [16]:
# separating the data (stemmed text) and the label (target) into plain arrays
X = twitter_data['stemmed_content'].values
y = twitter_data['target'].values

****Train-test split****

In [17]:
# Hold out 20% of the rows as the test set. `stratify=y` splits in a
# class-stratified fashion, and the fixed `random_state` makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1)

In [18]:
print(X_train)
print(X_test)

['rise shine lol min later plan rush door time'
 'nd interview today look promis' 'emilyalbracht feel pain' ...
 'bookwitt welcom chang mind though let know' 'howcoza bet bring backup'
 'window linux box instal bsd appl ipod yeah']
['would like even littl bit time work lay work'
 'look forward keep touch naomi long time sinc chat'
 'nbatvandr germani nba airtim got intern lp watch game sure' ...
 'sweet pea darn think journey like realli'
 'oh calvin harri call daft bastard feel cool new claim fame'
 'go famili meal hour cheer grandpa']


In [19]:
# converting text data into numerical data using vectorizer

vectorizer = TfidfVectorizer()

# The vectorizer is fitted on the training split only and then reused, already
# fitted, to transform the test split.
# NOTE(review): X_train / X_test are rebound here from arrays of text to the
# vectorizer's output, so re-running this cell without first re-running the
# train/test split cell passes already-vectorized data back into the
# vectorizer. Consider distinct names for the vectorized matrices.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)


In [20]:
print(X_train)
print(X_test)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 9455726 stored elements and shape (1280000, 461471)>
  Coords	Values
  (0, 408994)	0.20327132937613104
  (0, 106816)	0.37134449965091226
  (0, 347931)	0.4123017384299885
  (0, 317916)	0.30350077343669557
  (0, 227941)	0.2912488589003264
  (0, 266342)	0.3358584555454906
  (0, 239610)	0.20930055750051507
  (0, 364818)	0.37627918401650134
  (0, 341505)	0.41911697481409166
  (1, 324566)	0.5346829742944466
  (1, 240357)	0.32757289154579433
  (1, 411415)	0.28912454600209647
  (1, 181329)	0.5247713457687215
  (1, 286004)	0.49782742921934126
  (2, 306993)	0.41244791725873703
  (2, 129982)	0.27792888797524024
  (2, 118467)	0.8675495656028976
  (3, 15081)	0.44595664917320677
  (3, 244939)	0.24365166869261953
  (3, 136560)	0.2030585102219242
  (3, 165909)	0.30014312059830567
  (3, 15061)	0.15048519407525865
  (3, 125919)	0.2975473828320111
  (3, 439081)	0.1893315187082751
  (3, 375838)	0.24345453849281473
  :	:
  (1279995, 162976)	0.50

****TRAINING THE ML MODEL****

In [21]:
model = LogisticRegression(max_iter=1000)

In [22]:
model.fit(X_train, y_train)

In [23]:
# Model Evaluation

# Accuracy on the same data the model was fitted on; compare it with the
# held-out test accuracy computed in a later cell.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(y_train, X_train_prediction)

In [24]:
print("Accuracy score (training data): ", training_data_accuracy)

Accuracy score (training data):  0.81002421875


In [25]:
# Accuracy on the held-out test split (rows not used in model.fit).
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(y_test, X_test_prediction)

In [26]:
print("Accuracy score (test data): ", test_data_accuracy)

Accuracy score (test data):  0.778875


****Saving The Trained Model****

In [27]:
import pickle

In [28]:
# Serialise the fitted model to disk.
filename = 'trained_model.sav'
# Open the file in a `with` block so the handle is flushed and closed as soon
# as the dump finishes (the bare open() call used before was never closed).
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [29]:
# Serialise the fitted TF-IDF vectorizer to its own file, next to the model.
with open("vectorizer.pkl", "wb") as file:
    pickle.dump(vectorizer, file)