In [10]:
import re
import pickle
import numpy as np
import pandas as pd

# plotting
import seaborn as sns
import matplotlib.pyplot as plt

# nltk
import nltk
from nltk.stem import WordNetLemmatizer

# sklearn
from sklearn.svm import LinearSVC
from sklearn import preprocessing
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report

In [11]:
## Defining set containing all stopwords in english.
stopwordlist = ['a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an',
             'and','any','are', 'as', 'at', 'be', 'because', 'been', 'before',
             'being', 'below', 'between','both', 'by', 'can', 'd', 'did', 'do',
             'does', 'doing', 'down', 'during', 'each','few', 'for', 'from', 
             'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here',
             'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
             'into','is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma',
             'me', 'more', 'most','my', 'myself', 'now', 'o', 'of', 'on', 'once',
             'only', 'or', 'other', 'our', 'ours','ourselves', 'out', 'own', 're',
             's', 'same', 'she', "shes", 'should', "shouldve",'so', 'some', 'such',
             't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them',
             'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 
             'through', 'to', 'too','under', 'until', 'up', 've', 'very', 'was',
             'we', 'were', 'what', 'when', 'where','which','while', 'who', 'whom',
             'why', 'will', 'with', 'won', 'y', 'you', "youd","youll", "youre",
             "youve", 'your', 'yours', 'yourself', 'yourselves']

In [12]:
# Importing the dataset
DATASET_COLUMNS  = ["sentiment", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
dataset = pd.read_csv('training.1600000.processed.noemoticon.csv',encoding=DATASET_ENCODING , names=DATASET_COLUMNS)
print(dataset.head())

   sentiment         ids                          date      flag  \
0          0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY   
1          0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   
2          0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY   
3          0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
4          0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   

              user                                               text  
0  _TheSpecialOne_  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1    scotthamilton  is upset that he can't update his Facebook by ...  
2         mattycus  @Kenichan I dived many times for the ball. Man...  
3          ElleCTF    my whole body feels itchy and like its on fire   
4           Karoli  @nationwideclass no, it's not behaving at all....  


In [13]:
#number of rows and columns
dataset.shape

(1600000, 6)

In [14]:
#counting the number of missing values in the dataset
dataset.isnull().sum()

sentiment    0
ids          0
date         0
flag         0
user         0
text         0
dtype: int64

In [15]:
#checking the distribution of target column
dataset['sentiment'].value_counts()

sentiment
0    800000
4    800000
Name: count, dtype: int64

In [16]:
#converting the target 4 to 1
dataset.replace({'sentiment':{4:1}},inplace=True)

In [17]:
dataset['sentiment'].value_counts()

sentiment
0    800000
1    800000
Name: count, dtype: int64

In [18]:
from sklearn import preprocessing
def preprocess(textdata):
    processedText = []
    
    # Create Lemmatizer and Stemmer.
    wordLemm = WordNetLemmatizer()
    
    # Defining regex patterns.
    alphaPattern      = "[^a-zA-Z0-9]"
    
    for tweet in textdata:
        tweet = tweet.lower()        
        # Replace all non alphabets.
        tweet = re.sub(alphaPattern, " ", tweet)
        tweetwords = ''
        for word in tweet.split():
            # Checking if the word is a stopword.
            #if word not in stopwordlist:
            if len(word)>1:
                # Lemmatizing the word.
                word = wordLemm.lemmatize(word)
                tweetwords += (word+' ')
            
        processedText.append(tweetwords)
        
    return processedText

In [19]:
import time
t = time.time()
dataset['processedText']=dataset['text']
print(f'Text Preprocessing complete.')
print(f'Time Taken: {round(time.time()-t)} seconds')

Text Preprocessing complete.
Time Taken: 0 seconds


In [20]:
print(dataset['processedText'])

0          @switchfoot http://twitpic.com/2y1zl - Awww, t...
1          is upset that he can't update his Facebook by ...
2          @Kenichan I dived many times for the ball. Man...
3            my whole body feels itchy and like its on fire 
4          @nationwideclass no, it's not behaving at all....
                                 ...                        
1599995    Just woke up. Having no school is the best fee...
1599996    TheWDB.com - Very cool to hear old Walt interv...
1599997    Are you ready for your MoJo Makeover? Ask me f...
1599998    Happy 38th Birthday to my boo of alll time!!! ...
1599999    happy #charitytuesday @theNSPCC @SparksCharity...
Name: processedText, Length: 1600000, dtype: object


In [21]:
dataset['sentiment']

0          0
1          0
2          0
3          0
4          0
          ..
1599995    1
1599996    1
1599997    1
1599998    1
1599999    1
Name: sentiment, Length: 1600000, dtype: int64

In [22]:
#seperating data and labels
X=dataset['processedText'].values
y=dataset['sentiment'].values

In [23]:
print(X)

["@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"
 "is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!"
 '@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds'
 ... 'Are you ready for your MoJo Makeover? Ask me for details '
 'Happy 38th Birthday to my boo of alll time!!! Tupac Amaru Shakur '
 'happy #charitytuesday @theNSPCC @SparksCharity @SpeakingUpH4H ']


In [24]:
print(y)

[0 0 0 ... 1 1 1]


In [25]:
#splitting the data into training and test data
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,stratify=y,random_state=2)

In [26]:
print(X.shape,X_train.shape,X_test.shape)

(1600000,) (1280000,) (320000,)


In [27]:
print(X_train)

['about to watch saw iv and drink a lil wine ' "@HaterMagazine I'm in! "
 'even though its my favourite drink i think its the vodka and coke that wipes my mind all the time  think im gonna have to find a new drink'
 ... 'is eager for Monday afternoon '
 "Hope everyone and their mother had a great day!  Can't wait to hear what the guys have in store tomorrow!"
 'I love waking up to Folgers. Too bad my voice was deeper than his. ']


In [28]:
print(X_test)

["@mmangen - M doing fine! I haven't had much time to chat on Twitter.   Hubby is back for the summer &amp; tends to dominate my free time. "
 'at AHS may show w/  ruth kim &amp; geoffrey sanhueza '
 '@iShatara maybe it was only a bay area thang  dammit!' ...
 '@Destini41 Nevertheless Hooray! for 4700 members and have a wonderful and safe trip! '
 'Not feeling too well ' '@supersandro !!!! thank you! ']


In [29]:
#converting the text data into numerical data
vectorizer=TfidfVectorizer()
X_train=vectorizer.fit_transform(X_train)
X_test=vectorizer.transform(X_test)

In [30]:
print(X_train)

  (0, 566317)	0.4123398570437823
  (0, 317443)	0.38662946862459446
  (0, 169798)	0.3832112649811927
  (0, 57473)	0.14918233689170618
  (0, 259193)	0.48840858232447565
  (0, 461482)	0.3283935062719776
  (0, 558927)	0.3035607951325368
  (0, 528767)	0.12112707573781936
  (0, 40489)	0.24120756614816113
  (1, 251985)	0.21549449573925625
  (1, 230652)	0.9765050549311477
  (2, 381766)	0.14309833061129923
  (2, 197379)	0.17623751548259767
  (2, 230894)	0.10572438592439302
  (2, 218911)	0.1602635230660109
  (2, 250499)	0.13809309767160746
  (2, 526358)	0.13335422005784348
  (2, 51554)	0.12235906051490689
  (2, 355465)	0.20855733075890523
  (2, 566733)	0.35363328525903653
  (2, 517020)	0.10231390424363243
  (2, 131148)	0.2681373240012151
  (2, 555192)	0.28172473249149066
  (2, 517424)	0.142270305894794
  (2, 523182)	0.2864542762158459
  :	:
  (1279998, 188065)	0.26293790767268965
  (1279998, 562615)	0.2034240179077971
  (1279998, 240750)	0.22914088411240496
  (1279998, 226659)	0.2132662068923897

In [31]:
print(X_test)

  (0, 540515)	0.15792536311962382
  (0, 528767)	0.14519551210814183
  (0, 526358)	0.28061695099206196
  (0, 517424)	0.07484476201725739
  (0, 514780)	0.3602875215505537
  (0, 501895)	0.19100503431056134
  (0, 397223)	0.10689068511219833
  (0, 372800)	0.08816083516534921
  (0, 370285)	0.15432151848221323
  (0, 360633)	0.39258757628101915
  (0, 256825)	0.09613533994338157
  (0, 242946)	0.2441272357749445
  (0, 230930)	0.2017124268503555
  (0, 226659)	0.14620950509390615
  (0, 204417)	0.20529869009944648
  (0, 201909)	0.09906588316358035
  (0, 197441)	0.22153830924032758
  (0, 165924)	0.37505416545454745
  (0, 165538)	0.18104096237371695
  (0, 119150)	0.24248183645807
  (0, 75487)	0.1409000436500198
  (0, 56126)	0.14887694559244066
  (1, 475256)	0.2211696635570984
  (1, 453619)	0.4226272895568759
  (1, 343759)	0.24804295012372016
  :	:
  (319995, 169929)	0.2995093966628634
  (319995, 168058)	0.3153038756830633
  (319996, 581874)	0.1683272523939486
  (319996, 561279)	0.3736251575483514
  (

In [32]:
#Training the machine learing model
#logistic regression
from sklearn.linear_model import LogisticRegression
model=LogisticRegression(max_iter=1000)

In [33]:
model.fit(X_train,y_train)

In [34]:
#predicting the test set results
y_pred=model.predict(X_test)
print("predicted values",y_pred)

predicted values [1 1 0 ... 1 0 1]


In [35]:
#printing confusion matrix
cm=confusion_matrix(y_test,y_pred)
print(cm)

[[127188  32812]
 [ 30183 129817]]


In [36]:
#model evaluation
#accuracy score on the training data
from sklearn.metrics import accuracy_score
X_train_pred=model.predict(X_train)
train_data_accuracy=accuracy_score(y_train,X_train_pred)

In [37]:
print("accuracy score on the training data:",train_data_accuracy)

accuracy score on the training data: 0.8302109375


In [38]:
#accuracy score on the testing data
from sklearn.metrics import accuracy_score
X_test_pred=model.predict(X_test)
test_data_accuracy=accuracy_score(y_test,X_test_pred)

In [39]:
print("accuracy score on the testing data:",test_data_accuracy)

accuracy score on the testing data: 0.803140625


In [40]:
#saving the trained model
import pickle

In [None]:
filename="trained_model.sav"
pickle.dump(model,open(filename,'wb'))

In [41]:
#using the model for future predictions
#loading the saved model
loaded_model=pickle.load(open('trained_model.sav','rb'))

In [42]:
#predicting a new data
X_new=X_test[400]
print(y_test[400])
prediction=loaded_model.predict(X_new)
print(prediction)
if(prediction[0]==0):
    print("negative tweet")
else:
    print("positive tweet")

1
[1]
positive tweet
