In [1]:
import numpy as np
import pandas as pd
import re,string  #regular expression or RegEx in python
import matplotlib.pyplot as plt
from nltk.corpus import stopwords  #Natural Language ToolKit
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
import pickle
import nltk
nltk.download('stopwords')
%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SU3ARNA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.read_csv('Twitter_Data.csv', encoding = 'latin1')
df = df.sample(frac = 1)
df

Unnamed: 0,clean_text,category
147926,has fatwa been pronounced against narendra modi,0.0
73049,when people were giving credit modi like congr...,1.0
107856,exclusive interview please watch reasons why m...,0.0
137685,lets extrapolate this modi govt can hide such ...,1.0
136114,modi should get the credit for removing the mi...,1.0
...,...,...
78334,modi putin interacting with students delhi sou...,0.0
4357,you may think have lot hatred for bjp modi\nbu...,0.0
124399,narendra modi run away from studio during inte...,0.0
60456,when modi started make the announcement though...,0.0


In [3]:
#stop words
nltk.download('stopwords')
words = stopwords.words("english")
print(words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SU3ARNA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def preprocess_text(text):
    
    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
 
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

In [5]:
df

Unnamed: 0,clean_text,category
147926,has fatwa been pronounced against narendra modi,0.0
73049,when people were giving credit modi like congr...,1.0
107856,exclusive interview please watch reasons why m...,0.0
137685,lets extrapolate this modi govt can hide such ...,1.0
136114,modi should get the credit for removing the mi...,1.0
...,...,...
78334,modi putin interacting with students delhi sou...,0.0
4357,you may think have lot hatred for bjp modi\nbu...,0.0
124399,narendra modi run away from studio during inte...,0.0
60456,when modi started make the announcement though...,0.0


In [6]:
print("Missing values in  Dataset:")
print(df.isnull().sum())

Missing values in  Dataset:
clean_text    4
category      7
dtype: int64


In [7]:
df.category.nunique(), df.category.unique()

(3, array([ 0.,  1., -1., nan]))

In [8]:
df = df.dropna()
df.isnull().sum()

clean_text    0
category      0
dtype: int64

In [9]:
df.shape

(162969, 2)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 162969 entries, 147926 to 36287
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   clean_text  162969 non-null  object 
 1   category    162969 non-null  float64
dtypes: float64(1), object(1)
memory usage: 3.7+ MB


In [11]:
data = df['clean_text'][0].split()
for i in words:
  if i in data:
    c = data.count(i)
    for j in range(c):
      data.remove(i)
" ".join(data)


'modi promised â\x80\x9cminimum government maximum governanceâ\x80\x9d expected begin difficult job reforming state take years get justice state business exit psus temples'

In [12]:
# Data Cleaning using regex
regs = re.sub("[^a-zA-Z]", " ", df['clean_text'][126]).lower()
regs

'used bjp money visit japan singapore usa for yrs and left bjp when modi asked how tdp spend bjp moneyexp details yrs political experience use increased pension dwacra yuvanestam schemes just before elections win votes'

In [13]:
# Data Cleaning using stemmer

stemmer = PorterStemmer()
data = df['clean_text'][0].split()
print(data)
"".join([stemmer.stem(i) for i in df['clean_text'][0]])
# stemmer.stem("")

['when', 'modi', 'promised', 'â\x80\x9cminimum', 'government', 'maximum', 'governanceâ\x80\x9d', 'expected', 'him', 'begin', 'the', 'difficult', 'job', 'reforming', 'the', 'state', 'why', 'does', 'take', 'years', 'get', 'justice', 'state', 'should', 'and', 'not', 'business', 'and', 'should', 'exit', 'psus', 'and', 'temples']


'when modi promised â\x80\x9cminimum government maximum governanceâ\x80\x9d expected him begin the difficult job reforming the state why does take years get justice state should and not business and should exit psus and temples'

In [14]:
# Doing all cleaning process using regex, stemmer, stopwords for all data
df['cleaned'] = list(map(lambda x: " ".join([i for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in words]),df['clean_text']))
df['cleaned'] = df['cleaned'].apply(lambda x: " ".join([stemmer.stem(i) for i in x.lower().split()]))
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned'] = list(map(lambda x: " ".join([i for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in words]),df['clean_text']))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned'] = df['cleaned'].apply(lambda x: " ".join([stemmer.stem(i) for i in x.lower().split()]))


Unnamed: 0,clean_text,category,cleaned
147926,has fatwa been pronounced against narendra modi,0.0,fatwa pronounc narendra modi
73049,when people were giving credit modi like congr...,1.0,peopl give credit modi like congratul isro she...
107856,exclusive interview please watch reasons why m...,0.0,exclus interview pleas watch reason modi ralli...
137685,lets extrapolate this modi govt can hide such ...,1.0,let extrapol modi govt hide import news nation...
136114,modi should get the credit for removing the mi...,1.0,modi get credit remov middlemen involv corrupt...
...,...,...,...
78334,modi putin interacting with students delhi sou...,0.0,modi putin interact student delhi sourc youtub...
4357,you may think have lot hatred for bjp modi\nbu...,0.0,may think lot hatr bjp modi truth grate bjp rs...
124399,narendra modi run away from studio during inte...,0.0,narendra modi run away studio interview karan ...
60456,when modi started make the announcement though...,0.0,modi start make announc thought go say contact...


In [20]:
# Training model
from sklearn.linear_model import LogisticRegression
log_regression = LogisticRegression()

vectorizer = TfidfVectorizer()
X = df['cleaned']
Y = df['category']

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.15) #Splitting dataset

 #Creating Pipeline
pipeline = Pipeline([('vect', vectorizer),
                     ('chi',  SelectKBest(chi2, k=2000)),
                     ('clf', LogisticRegression(random_state=1, max_iter=1000))])




# #Training model
model = pipeline.fit(X_train, y_train)



In [21]:
print(X_test,y_test)

127689    true arfa that ppl india want modi chair rathe...
18411     remind modi promiseonli arvind kejriw make lif...
77519     wait everi speech honor prime minist sri naren...
40127     modi modi india fool think modi india dont let...
104136    goodh kick bjp much earlierh modi wavebut time...
                                ...                        
130407                          modi enough lie support bjp
91275                 percent peopl said modi investig asap
107564          may modi win articl ram templ decis complet
95426     even delet frnd mine rahul gandhi bhaktsmost f...
152492    funni modi gujarat record victori chose varana...
Name: cleaned, Length: 24446, dtype: object 127689    1.0
18411     1.0
77519     0.0
40127     0.0
104136    1.0
         ... 
130407    0.0
91275     0.0
107564    1.0
95426     1.0
152492    1.0
Name: category, Length: 24446, dtype: float64


In [22]:
#Accuracy
from sklearn.metrics import accuracy_score
predict_news_cat = model.predict(X_test)
print(accuracy_score(y_test,predict_news_cat))


0.8475006135973165


In [23]:
#Accuracy
from sklearn.metrics import accuracy_score
predict_test_news_cat = model.predict(X_test)
predict_train_news_cat = model.predict(X_train)
print("Test accuracy = ",accuracy_score(y_test,predict_test_news_cat))
print("Train accuracy = ",accuracy_score(y_train,predict_train_news_cat))
print('\n')

Test accuracy =  0.8475006135973165
Train accuracy =  0.8495701074911748




In [19]:
tweets = input("Enter tweets = ")
tweets_data = {'predict_tweets':[tweets]}
tweets_data_df = pd.DataFrame(tweets_data)

predict_tweets_cat = model.predict(tweets_data_df['predict_tweets'])
print("Predicted tweets type = ",predict_tweets_cat[0])

Enter tweets = i am sad
Predicted tweets type =  -1.0
