In [45]:
# Install the third-party packages this notebook relies on into the running
# kernel's environment (%pip targets the kernel, unlike !pip).
# nltk and pandas are imported by the next cell, so they are installed here
# alongside kaggle and scikit-learn; -q keeps pip's own log output short.
# TODO: pin exact versions (pkg==x.y.z) once the working set is known.
%pip install -q kaggle scikit-learn nltk pandas

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [46]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import nltk

In [47]:
# Fetch the NLTK stop-word corpus, then show the English word list so the
# reader can see which tokens will be filtered out later.
nltk.download('stopwords')
english_stopwords = stopwords.words('english')
print(english_stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/pheonix/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [49]:
# Read the tweet CSV. Column labels are supplied through `names`, so every
# row of the file is treated as data rather than as a header.
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']
twitter_data = pd.read_csv(
    'Twitter SA.csv',
    header=None,
    names=column_names,
    encoding='ISO-8859-1',
)

In [50]:
# Quick sanity check: print the (rows, columns) tuple, then the first rows.
for summary in (twitter_data.shape, twitter_data.head()):
    print(summary)

(1600000, 6)
   target          id                          date      flag  \
0       0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY   
1       0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   
2       0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY   
3       0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
4       0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   

              user                                               text  
0  _TheSpecialOne_  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1    scotthamilton  is upset that he can't update his Facebook by ...  
2         mattycus  @Kenichan I dived many times for the ball. Man...  
3          ElleCTF    my whole body feels itchy and like its on fire   
4           Karoli  @nationwideclass no, it's not behaving at all....  


In [51]:
# Tally null entries per column to confirm nothing needs imputing or dropping.
missing_per_column = twitter_data.isnull().sum()
print(missing_per_column)

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64


In [52]:
# Relabel the `target` column so that the value 4 becomes 1; all other
# values are left as they are. The class balance is printed afterwards.
twitter_data['target'] = twitter_data['target'].replace(4, 1)
target_counts = twitter_data['target'].value_counts()
print(target_counts)

target
0    800000
1    800000
Name: count, dtype: int64


In [53]:
# Create one shared PorterStemmer instance. It is stored in the notebook-level
# name `port_stem`, which stemming() reads each time it reduces a word.
port_stem = PorterStemmer()

In [54]:
# Define stemming function

# Anything that is not an ASCII letter is treated as a word separator.
_NON_LETTERS = re.compile('[^a-zA-Z]')

# Build the stop-word lookup once. Calling stopwords.words('english') for every
# single word (as a list, scanned linearly) dominates the run time when this is
# applied to the whole dataset; a frozenset gives constant-time membership
# tests and the same filtering result.
# NOTE: this line needs the 'stopwords' corpus to be downloaded already, so the
# nltk.download('stopwords') cell must run before this one.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def stemming(content):
    """Normalise one piece of text into a space-joined string of word stems.

    Steps, in order:
      1. replace every character that is not a-z / A-Z with a space,
      2. lower-case the text and split it on whitespace,
      3. drop words found in NLTK's English stop-word list,
      4. stem each remaining word with the shared ``port_stem`` stemmer,
      5. join the stems back together with single spaces.

    Parameters
    ----------
    content : str
        Raw text to normalise.

    Returns
    -------
    str
        The stemmed, stop-word-free text; an empty string when no words remain.
    """
    words = _NON_LETTERS.sub(' ', content).lower().split()
    return ' '.join(
        port_stem.stem(word) for word in words if word not in _ENGLISH_STOPWORDS
    )

In [55]:
# Run stemming() over every tweet and keep the result in a new column, then
# preview the first few processed rows.
stemmed_series = twitter_data['text'].map(stemming)
twitter_data['stemmed_content'] = stemmed_series
print(twitter_data['stemmed_content'].head())

0    switchfoot http twitpic com zl awww bummer sho...
1    upset updat facebook text might cri result sch...
2    kenichan dive mani time ball manag save rest g...
3                      whole bodi feel itchi like fire
4                        nationwideclass behav mad see
Name: stemmed_content, dtype: object


In [57]:
twitter_data.head()
print(twitter_data['stemmed_content'])
# seperating the data and the labels
X = twitter_data['stemmed_content'].values
print(X)
print(Y)
Y = twitter_data['target'].values

0          switchfoot http twitpic com zl awww bummer sho...
1          upset updat facebook text might cri result sch...
2          kenichan dive mani time ball manag save rest g...
3                            whole bodi feel itchi like fire
4                              nationwideclass behav mad see
                                 ...                        
1599995                           woke school best feel ever
1599996    thewdb com cool hear old walt interview http b...
1599997                         readi mojo makeov ask detail
1599998    happi th birthday boo alll time tupac amaru sh...
1599999    happi charitytuesday thenspcc sparkschar speak...
Name: stemmed_content, Length: 1600000, dtype: object
['switchfoot http twitpic com zl awww bummer shoulda got david carr third day'
 'upset updat facebook text might cri result school today also blah'
 'kenichan dive mani time ball manag save rest go bound' ...
 'readi mojo makeov ask detail'
 'happi th birthday boo alll time 

In [59]:
# Hold out 20% of the samples for testing. Stratifying on Y keeps the label
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=69,
)
print(X.shape, X_train.shape, X_test.shape)

(1600000,) (1280000,) (320000,)


In [60]:
# Turn the text into TF-IDF feature matrices.
# The vectorizer is fitted on the training split only and then reused, already
# fitted, to transform the test split.
vectorizer = TfidfVectorizer()

train_matrix = vectorizer.fit_transform(X_train)
test_matrix = vectorizer.transform(X_test)

# Later cells read the vectorized data through the X_train / X_test names.
X_train, X_test = train_matrix, test_matrix


In [61]:
# Fit a logistic-regression classifier on the vectorized training data.
# max_iter is raised to 999 to give the solver more iterations to converge.
model = LogisticRegression(max_iter=999)
model.fit(X=X_train, y=Y_train)

In [65]:
# Score the fitted model on the data it was trained on.
X_train_predict = model.predict(X_train)
accuracy = accuracy_score(y_true=Y_train, y_pred=X_train_predict)
print('Accuracy on the training data : ', accuracy)


Accuracy on the training data :  0.79655390625


In [67]:
# Score the fitted model on the held-out test split.
X_test_predict = model.predict(X_test)
accuracy = accuracy_score(y_true=Y_test, y_pred=X_test_predict)
print('Accuracy on the test data : ', accuracy)


Accuracy on the test data :  0.776803125
