## Introduction
This project aims to analyze the sentiment of the people on this new law by studying the messages tweeted by them on this topic. All the messages tweeted with the hashtag of CAA, Citizenship Amendment Act etc are extracted using the Twitter APIs and used as input dataset for this project. These messages are then classified as positive, to indicate that they are supportive of the bill, negative, to indicate that they do not endorse the new law, and neutral. This classification is made by training a model, which is based on the Naive Bayes classifier algorithm, on a sample dataset whose classification is known beforehand. The model classifies an input record by calculating probability of the record being positive (p), negative (ng) and neutral (ne) and if p > ng and p > ne then it classifies the record as positive, if ng > p and ng > ne then it classifies as negative and otherwise it classifies as neutral. The model is applied to the extracted twitter dataset to classify them as positive, negative or neutral and a pie chart is drawn to present the analysis made on the input dataset. The analysis will show the general mood or opinion of the people expressed on this new amendment moved forward by the government of India.

## Importing Required Libraries

In [None]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np 
import os 
import pandas as pd 


## Required as we are using Kaggle notebook

In [None]:
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))


In [None]:
# Distribution graphs (histogram/bar graph) of column data
def plotPerColumnDistribution(df, nGraphShown, nGraphPerRow):
    """Plot the value distribution of each low-cardinality column of ``df``.

    Only columns with more than 1 and fewer than 50 distinct values are
    considered. A column whose first value is numeric gets a histogram; any
    other column gets a bar chart of its value counts.

    Args:
        df: DataFrame whose columns are plotted.
        nGraphShown: Maximum number of columns to plot.
        nGraphPerRow: Number of subplots per row of the figure grid.

    Returns:
        None. The figure is displayed with ``plt.show()``. When there is
        nothing to plot, a message is printed and no figure is created.
    """
    nunique = df.nunique()
    # For display purposes, keep columns with more than 1 and fewer than 50 unique values.
    df = df[[col for col in df if 1 < nunique[col] < 50]]
    columnNames = list(df)
    nPlots = min(len(columnNames), nGraphShown)
    if nPlots <= 0:
        print('No distribution plots shown: there are no columns to plot')
        return
    # Integer ceiling division: plt.subplot() needs an integer row count, and
    # sizing the grid from the plotted columns avoids empty rows.
    nGraphRow = (nPlots + nGraphPerRow - 1) // nGraphPerRow
    plt.figure(num=None, figsize=(6 * nGraphPerRow, 8 * nGraphRow), dpi=80, facecolor='w', edgecolor='k')
    for i in range(nPlots):
        plt.subplot(nGraphRow, nGraphPerRow, i + 1)
        columnDf = df.iloc[:, i]
        if not np.issubdtype(type(columnDf.iloc[0]), np.number):
            # Non-numeric column: one bar per distinct value.
            columnDf.value_counts().plot.bar()
        else:
            columnDf.hist()
        plt.ylabel('counts')
        plt.xticks(rotation=90)
        plt.title(f'{columnNames[i]} (column {i})')
    plt.tight_layout(pad=1.0, w_pad=1.0, h_pad=1.0)
    plt.show()


In [None]:
# Correlation matrix
def plotCorrelationMatrix(df, graphWidth):
    """Draw the correlation matrix of the numeric columns of ``df``.

    Columns containing NaN, non-numeric columns and constant columns are
    removed before the correlation is computed.

    Args:
        df: DataFrame to analyse. If it carries a ``dataframeName``
            attribute, that value is used in the plot title.
        graphWidth: Width and height of the square figure, in inches.

    Returns:
        None. The figure is displayed with ``plt.show()``. When fewer than
        two usable columns remain, a message is printed instead.
    """
    # `dataframeName` is an ad-hoc attribute set by the caller; fall back to
    # a generic label when the frame does not have it.
    filename = getattr(df, 'dataframeName', 'DataFrame')
    df = df.dropna(axis='columns')  # drop columns with NaN
    # Correlation is only defined for numeric data; selecting these columns
    # up front keeps DataFrame.corr() from failing on text columns.
    df = df.select_dtypes(include=[np.number, 'bool'])
    df = df[[col for col in df if df[col].nunique() > 1]]  # keep columns with more than 1 unique value
    if df.shape[1] < 2:
        print(f'No correlation plots shown: The number of non-NaN or constant columns ({df.shape[1]}) is less than 2')
        return
    corr = df.corr()
    plt.figure(num=None, figsize=(graphWidth, graphWidth), dpi=80, facecolor='w', edgecolor='k')
    corrMat = plt.matshow(corr, fignum=1)
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.columns)), corr.columns)
    plt.gca().xaxis.tick_bottom()
    plt.colorbar(corrMat)
    plt.title(f'Correlation Matrix for {filename}', fontsize=15)
    plt.show()


In [None]:
# Scatter and density plots
def plotScatterMatrix(df, plotSize, textSize):
    """Draw a scatter matrix with KDE diagonals for the numeric columns of ``df``.

    Columns containing NaN and constant columns are removed, and at most the
    first 10 remaining columns are plotted. Each panel above the diagonal is
    annotated with the correlation coefficient of its column pair.

    Args:
        df: DataFrame to plot.
        plotSize: Width and height of the square figure, in inches.
        textSize: Font size of the correlation annotations.

    Returns:
        None. The figure is displayed with ``plt.show()``. When no usable
        column remains, a message is printed instead.
    """
    df = df.select_dtypes(include=[np.number])  # keep only numerical columns
    # Remove rows and columns that would lead to df being singular
    df = df.dropna(axis='columns')
    df = df[[col for col in df if df[col].nunique() > 1]]  # keep columns with more than 1 unique value
    if df.shape[1] == 0:
        print('No scatter plots shown: there are no non-constant numerical columns without NaN')
        return
    # Reduce the number of columns for matrix inversion of kernel density plots.
    df = df[list(df)[:10]]
    ax = pd.plotting.scatter_matrix(df, alpha=0.75, figsize=[plotSize, plotSize], diagonal='kde')
    corrs = df.corr().values
    # Annotate only the panels above the diagonal (k=1 skips the diagonal).
    for i, j in zip(*np.triu_indices_from(ax, k=1)):
        ax[i, j].annotate('Corr. coef = %.3f' % corrs[i, j], (0.8, 0.2), xycoords='axes fraction', ha='center', va='center', size=textSize)
    plt.suptitle('Scatter and Density Plot')
    plt.show()


### 1. Data Exploration and Visualization

In [None]:
# Number of rows to load from the CSV; set to None to read the whole file.
# The model in this notebook is built on the first 30000 rows only.
nRowsRead = 30000
df1 = pd.read_csv(
    '../input/caa-tweets-till-9012020/file.csv',
    delimiter=',',
    nrows=nRowsRead,
)
# Name tag that plotCorrelationMatrix reads back for its plot title.
df1.dataframeName = 'file.csv'
nRow, nCol = df1.shape
print(f'There are {nRow} rows and {nCol} columns')

Let's take a quick look at what the data looks like:

In [None]:
# Summary statistics of df1, shown as the cell output
df1.describe()

Distribution graphs (histogram/bar graph) of sampled columns:

In [None]:
# Plot at most 10 column distributions, 5 per row, from the first nRowsRead rows
df_plot = df1.head(nRowsRead)
plotPerColumnDistribution(df_plot, 10, 5)

In [None]:
# Rebind df_plot to the first 20000 rows and show the type of the result
df_plot = df1.head(20000)
type(df_plot)

Correlation matrix:

In [None]:
# Correlation matrix of df1 with graphWidth 8
plotCorrelationMatrix(df1, 8)

Scatter and density plots:

In [None]:
# Scatter/density matrix of df_plot with plotSize 20 and annotation textSize 10
plotScatterMatrix(df_plot, 20, 10)

## 2. Data Pre-processing
**Dropping duplicate rows**

There are two types of duplicates: rows with the same values in every column (this happens when the tweet collector gathers the same tweet again) and rows with the same tweet text (this occurs when two or more users post the same tweet).

In [None]:
# Total number of rows, then the number of rows that repeat an earlier row in
# every column, then the number of rows that repeat an earlier tweet text.
print(len(df1.index))
print(int(df1.duplicated().sum()))
print(int(df1.duplicated(['tweet']).sum()))

From the above results we can see that there are about '2819' records that are duplicated and we had to remove the duplicated rows as it was resulting in overfitting

In [None]:
# Display the first 205 rows of the raw data
df1.head(205)

## 2.1 Reindexing

The deletion of duplicate rows results in irregular indexing, so the data is re-indexed.

In [None]:
# Drop every row whose tweet text repeats an earlier row, keeping the first
# occurrence (rows duplicated in every column are removed by the same rule).
# Filtering on df1.index.duplicated() removed nothing, because it only looks
# at index labels, not at the tweet text.
# reset_index(drop=True) rebuilds a contiguous 0..n-1 index, which the later
# cells rely on when they address rows with df.loc[i]; drop=True avoids
# adding the old index as an extra column.
df = df1.drop_duplicates(subset=['tweet'], keep='first').reset_index(drop=True)

In [None]:
# List the column labels of df
print(df.columns)

These are the features available in the data set but we do not need all of these features to perform a sentimental analysis. So we are exploring the features which are 'NaN' and the features which actually have the data

In [None]:
# Print the number of missing (NaN) values of each column below, one count
# per line, in the order listed.
for column in (
    'id', 'conversation_id', 'created_at', 'date', 'time', 'timezone',
    'user_id', 'username', 'name', 'retweet', 'tweet', 'mentions', 'urls',
    'photos', 'replies_count', 'retweets_count', 'likes_count', 'hashtags',
    'cashtags', 'reply_to', 'video',
):
    print(df[column].isna().sum())




In [None]:
# Print the number of missing (NaN) values of each column below, one count
# per line, in the order listed.
for column in (
    'near', 'geo', 'source', 'user_rt_id', 'user_rt', 'retweet_id',
    'retweet_date', 'translate', 'trans_src', 'trans_dest', 'place',
    'quote_url',
):
    print(df[column].isna().sum())


We see that in all of the above columns almost all of the 30000 rows are NaN, so we need to drop these features.

## 2.2 Dropping unnecessary columns

Features like 'near','geo','source','user_rt_id','user_rt','retweet_id','retweet_date','translate','trans_src',           'trans_dest','place','quote_url','urls','link','id','conversation_id','user_id','photos','video','hashtags','cashtags' etc have more not-a-number(NaN) as their values.

In [None]:
# Columns removed before the analysis; df is rebound to the reduced frame.
columns_to_drop = [
    'near', 'geo', 'source', 'user_rt_id', 'user_rt', 'retweet_id',
    'retweet_date', 'translate', 'trans_src', 'trans_dest', 'place',
    'quote_url', 'urls', 'link', 'id', 'conversation_id', 'user_id',
    'photos', 'video', 'hashtags', 'cashtags',
]
df = df.drop(columns=columns_to_drop)


We are also dropping other columns like 'id','conversation_id','user_id' as these features are unique and will not help us to perform a sentimental analysis

In [None]:
# Display the first 5 rows of df
df.head(5)

## 2.3 Cleaning the tweets

In the text of a tweet, there may be some unnecessary symbols that are not essential for our analysis. Let's explore by printing a tweet.

In [None]:
# Show the tweet text stored under index label 2
df['tweet'][2]

As we can see, some of the unnecessary text and symbols to be removed are — username_tags(like @News18Rajasthan,@zeerajasthan_), retweet symbol(RT), hashtags(like #CAARally, #CAA_NRC_Protests), URLs(like pic.twitter.com/6CJirGJ70o),numbers and punctuations .Some of the meaningful hashtags convey meaning and can have some sentiment in it after the word is segmented into useful parts (like #No #CAA). So, instead of removing all the words starting with hashtag symbols, only ‘#’ symbols are removed. We perform this text cleaning using re module in python. The re.sub() function searches for a pattern and replaces with the text we specify. We replace all these symbols with a whitespace character.

In [None]:
# Display the first rows of df
df.head()

In [None]:
import re

# Patterns are compiled once instead of being rebuilt for every row.
_URL_RE = re.compile(r'https?://\S+')
_PIC_LINK_RE = re.compile(r'pic\.twitter\.com/[A-Za-z0-9./]+')
_MENTION_RE = re.compile(r'@[A-Z0-9a-z_:]+')
# Only the literal retweet marker "RT" at the start of the text is removed.
# The previous pattern '^[RT]+' stripped ANY leading run of the letters R/T,
# so e.g. "The law" became "he law".
_RETWEET_RE = re.compile(r'^\s*RT\b')
_NON_LETTER_RE = re.compile(r'[^a-zA-Z]')


def clean_tweet(txt):
    """Return ``txt`` with links, mentions, the RT marker and non-letters removed.

    URLs, pic.twitter.com links, @username tags and a leading "RT" marker
    are deleted. Every remaining character that is not an ASCII letter
    (digits, punctuation, '#', ...) is replaced by a single space, so the
    words of a hashtag are kept while the '#' itself is dropped.

    Args:
        txt: Raw tweet text. Any non-string value (for example NaN) is
            treated as an empty tweet.

    Returns:
        The cleaned text as a string.
    """
    if not isinstance(txt, str):
        return ''
    # URLs go first and are removed up to the next whitespace, so characters
    # such as '-', '_' or '?' inside a link do not leave fragments behind.
    txt = _URL_RE.sub('', txt)
    txt = _PIC_LINK_RE.sub('', txt)
    txt = _MENTION_RE.sub('', txt)
    txt = _RETWEET_RE.sub('', txt)
    return _NON_LETTER_RE.sub(' ', txt)


# Clean the whole column at once; unlike a df.loc[i] loop this does not
# depend on the index being exactly 0..n-1.
df['tweet'] = df['tweet'].map(clean_tweet)

Now, we can see that our tweets appear clean.

This tweet is before the pre-processing

In [None]:
# Tweet under index label 2 in df1, the frame as loaded from the CSV
df1['tweet'][2]

This is after cleaning the twitter data

In [None]:
# Tweet under index label 2 in df, the frame that went through the cleaning cell
df['tweet'][2]


## 3. Sentimental Analysis

## 3.1 POS-Tagging and Sentiment labeling

We are done with the basic cleaning of the text data. SentiWordNet is an enhanced lexical resource explicitly devised for supporting sentiment classification and opinion mining applications. It has a large corpus of POS-tagged English words along with their sentiment scores.

In [None]:
import time
import nltk
from nltk.corpus import sentiwordnet as swn
from nltk.tag import pos_tag,map_tag
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

pstem = PorterStemmer()
lem = WordNetLemmatizer()
stop_words = stopwords.words('english')
# Universal POS tag -> WordNet POS letter. Words with any other tag are not
# looked up in SentiWordNet.
_UNIVERSAL_TO_WN_POS = {'NOUN': 'n', 'VERB': 'v', 'ADJ': 'a', 'ADV': 'r'}


def _candidate_forms(word):
    """Yield the forms of ``word`` to try in order: as-is, lemmatized, stemmed."""
    yield word
    yield lem.lemmatize(word)
    yield pstem.stem(word)


def _swn_scores(word, wn_pos):
    """Return (pos_score, neg_score) of the first sense of ``word``, or None if no form is found."""
    for form in _candidate_forms(word):
        name = form + '.' + wn_pos + '.01'
        try:
            synset = swn.senti_synset(name)
        except Exception:
            # Best-effort lookup, as before: a form that cannot be resolved
            # is skipped and the next form is tried.
            continue
        if synset is not None:
            return synset.pos_score(), synset.neg_score()
    return None


def pos_senti(df_copy):
    """Score every tweet with SentiWordNet and add the scores to ``df_copy``.

    Each tweet is tokenized and POS-tagged. For every noun, verb, adjective
    and adverb the scores of its first SentiWordNet sense are looked up
    (trying the word itself, then its lemma, then its stem) and summed per
    tweet. Words that cannot be found contribute nothing.

    Three columns are inserted into ``df_copy``:
        pos_score:  sum of the positive scores of the tweet's words
        neg_score:  sum of the negative scores of the tweet's words
        sent_score: 1 if pos_score > neg_score, -1 if neg_score > pos_score,
                    0 otherwise (no scored word, or equal totals)

    Args:
        df_copy: DataFrame with a 'tweet' column of text. It is modified in
            place.

    Returns:
        The same ``df_copy`` object, with the three columns added.
    """
    li_swn_pos = []
    li_swn_neg = []
    li_swn = []
    for text in df_copy['tweet']:
        tokens = nltk.word_tokenize(text)
        tagged = [(word, map_tag('en-ptb', 'universal', tag)) for word, tag in pos_tag(tokens)]

        pos_total = 0
        neg_total = 0
        for word, tag in tagged:
            wn_pos = _UNIVERSAL_TO_WN_POS.get(tag)
            if wn_pos is None:
                continue
            scores = _swn_scores(word, wn_pos)
            if scores is None:
                continue
            pos_total += scores[0]
            neg_total += scores[1]
        li_swn_pos.append(pos_total)
        li_swn_neg.append(neg_total)

        # Equal totals are neutral; previously a tie between two non-zero
        # totals was labelled negative.
        if pos_total > neg_total:
            li_swn.append(1)
        elif neg_total > pos_total:
            li_swn.append(-1)
        else:
            li_swn.append(0)

    new_columns = (
        (5, "pos_score", li_swn_pos),
        (6, "neg_score", li_swn_neg),
        (7, "sent_score", li_swn),
    )
    for loc, name, values in new_columns:
        # Same positions as before (5, 6, 7), clamped so that frames with
        # fewer columns get the new columns appended instead of failing.
        # allow_duplicates=True is kept from the original call: running this
        # twice on the same frame adds a second set of columns.
        df_copy.insert(min(loc, len(df_copy.columns)), name, values, allow_duplicates=True)
    return df_copy

In [None]:
# Label every tweet and report the wall-clock timestamps around the run.
st = time.time()
print('Start time: ' + str(st))
df_copy = pos_senti(df)
end = time.time()
# Report the end timestamp; the start timestamp was printed here by mistake.
print('End time: ' + str(end))

In [None]:
# Display the first rows of the labelled frame
df_copy.head()

In [None]:
# Show the type of the sent_score value stored under index label 0
type(df_copy['sent_score'][0])

## 4. Results by using SentiWordNet

SWN gives a pos_score and a neg_score for each word, as in the above case. The higher the pos_score, the more positive the word. We use a simple linear summation of these scores: we add the pos_score of all the words in a tweet to form pos_total and, in a similar way, obtain neg_total. We then take the difference of the two (sent_total = pos_total - neg_total) and label a tweet as positive (1) if sent_total is greater than 0, negative (-1) if it is less than 0, and neutral (0) otherwise.

In [None]:
# Tally the SentiWordNet labels: values above 0 are positive, values equal
# to 0 are neutral, and every remaining row counts as negative.
sent_scores = df_copy['sent_score']
count_pos_sen = int((sent_scores > 0).sum())
count_neut_sen = int((sent_scores == 0).sum())
count_neg_sen = len(df_copy.index) - count_pos_sen - count_neut_sen

print("positive tweets:", count_pos_sen)
print("negative tweets:", count_neg_sen)
print("neutral tweets:", count_neut_sen)

## 5. Results by using TextBlob

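TextBlob computes a single polarity score for each tweet. A tweet is counted as positive if its polarity is greater than 0, negative if it is less than 0, and neutral if it is exactly 0.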

In [None]:
# TextBlob sentiment labelling
from textblob import TextBlob

# One label per tweet: 1 for polarity above 0, -1 for polarity below 0,
# 0 otherwise.
li_tb = []
for tweet in df['tweet']:
    polarity = TextBlob(str(tweet)).sentiment.polarity
    if polarity > 0:
        li_tb.append(1)
    elif polarity < 0:
        li_tb.append(-1)
    else:
        li_tb.append(0)

count_pos = li_tb.count(1)
count_neg = li_tb.count(-1)
count_neut = li_tb.count(0)
# Every tweet receives a label, so this equals the number of tweets.
count_total = len(li_tb)

print("Total tweets:", len(df.index))
print("Total tweets with sentiment:", count_total)
print("positive tweets:", count_pos)
print("negative tweets:", count_neg)
print("neutral tweets:", count_neut)


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Build one text per sentiment class for the word clouds: positive
# (sent_score == 1), negative (sent_score == -1) and everything else.
# Tweets are joined with a space; plain concatenation glued the last word of
# one tweet to the first word of the next.
tweet_texts = df_copy["tweet"].astype(str)
tweet_labels = df_copy["sent_score"]
pos_text = " ".join(tweet_texts[tweet_labels == 1])
neg_text = " ".join(tweet_texts[tweet_labels == -1])
neut_text = " ".join(tweet_texts[(tweet_labels != 1) & (tweet_labels != -1)])

list_text = [pos_text, neg_text, neut_text]


for txt in list_text:
    if not txt.strip():
        # WordCloud.generate() raises ValueError when there are no words,
        # so a class without any tweets is skipped.
        continue
    word_cloud = WordCloud(width=600, height=600, max_font_size=200).generate(txt)
    plt.figure(figsize=(12, 10))  # create a new figure
    plt.imshow(word_cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Build one text per sentiment class for the word clouds: positive
# (sent_score == 1), negative (sent_score == -1) and everything else.
# Tweets are joined with a space; plain concatenation glued the last word of
# one tweet to the first word of the next.
tweet_texts = df["tweet"].astype(str)
tweet_labels = df["sent_score"]
pos_text = " ".join(tweet_texts[tweet_labels == 1])
neg_text = " ".join(tweet_texts[tweet_labels == -1])
neut_text = " ".join(tweet_texts[(tweet_labels != 1) & (tweet_labels != -1)])

list_text = [pos_text, neg_text, neut_text]


for txt in list_text:
    if not txt.strip():
        # WordCloud.generate() raises ValueError when there are no words,
        # so a class without any tweets is skipped.
        continue
    word_cloud = WordCloud(width=600, height=600, max_font_size=200).generate(txt)
    plt.figure(figsize=(12, 10))  # create a new figure
    plt.imshow(word_cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()


## 6. Results By Using Logistic Regression

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# Build the modelling frame from the labelled data, keeping only the tweet
# text and its sent_score label
data = pd.DataFrame(df_copy, columns = ['tweet', 'sent_score']) 

In [None]:
# Display the first rows of the modelling frame
data.head()

In [None]:
# Convert the text data into sequence using text_to_sequence method

def text2seq(data):
    """Turn the 'tweet' column of ``data`` into padded integer sequences.

    A Tokenizer created with num_words=2000 is fitted on the tweets, each
    tweet is converted with texts_to_sequences, and the result is passed
    through pad_sequences.

    Args:
        data: DataFrame with a 'tweet' column of text.

    Returns:
        The output of pad_sequences for the tokenized tweets.
    """
    vocabulary_size = 2000
    tweets = data['tweet'].values
    tokenizer = Tokenizer(num_words=vocabulary_size, split=' ')
    tokenizer.fit_on_texts(tweets)
    return pad_sequences(tokenizer.texts_to_sequences(tweets))

X = text2seq(data)
print("Dimension of the input data after text_to_sequence method: ", X.shape)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
def Tf_idf(data):
    """Vectorize the 'tweet' column of ``data`` with TF-IDF.

    The learned vocabulary is printed as a list.

    Args:
        data: DataFrame with a 'tweet' column of text.

    Returns:
        The matrix returned by ``TfidfVectorizer.fit_transform``.
    """
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(data['tweet'])
    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); the old method is used only where the new one
    # does not exist.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    # list() keeps the printed output a plain, untruncated list.
    print(list(feature_names))
    return X

X = Tf_idf(data)
#print("Dimension of input data after tf-idf vectorization:", X.shape)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
def count_vectorizer(data):
    """Build a bag-of-words feature table from the 'tweet' column of ``data``.

    The CountVectorizer is configured with max_df=0.90, min_df=2,
    max_features=1000 and English stop words.

    Args:
        data: DataFrame with a 'tweet' column of text.

    Returns:
        A DataFrame holding the dense bag-of-words matrix.
    """
    vectorizer = CountVectorizer(
        max_df=0.90,
        min_df=2,
        max_features=1000,
        stop_words='english',
    )
    # Sparse bag-of-words matrix, densified before it is wrapped in a frame.
    counts = vectorizer.fit_transform(data['tweet'])
    return pd.DataFrame(counts.todense())
X = count_vectorizer(data)


In [None]:
from sklearn.model_selection import train_test_split
# Y = pd.get_dummies(data['Label']).values

def split_train_test(X, Y, test_size=0.20, random_state=None):
    """Split features and labels into training and test sets.

    Args:
        X: Feature table, one row per sample.
        Y: Labels, one per row of ``X``.
        test_size: Fraction of the samples placed in the test set.
        random_state: Seed forwarded to ``train_test_split``; pass an int
            for a reproducible split.

    Returns:
        The tuple ``(X_train, X_test, Y_train, Y_test)``.
    """
    # Use the labels that were passed in. Previously the argument was
    # overwritten with the global data["sent_score"], so the function
    # silently ignored its own parameter.
    Y = np.asarray(Y)
    print(Y)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )
    print("Dimension of training features and label: ", X_train.shape, Y_train.shape)
    print("Dimension of testing features and label: ", X_test.shape, Y_test.shape)

    return X_train, X_test, Y_train, Y_test

Y = data['sent_score']
X_train, X_test, Y_train, Y_test = split_train_test(X, Y)

## 6.1 Accuracy of the Model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

def Logistic_regression_model(X_train, X_test, Y_train, Y_test):
    """Fit a logistic regression and report its accuracy on the test set.

    The model is created with C=100 and max_iter=500, fitted on the training
    data and used to predict the test features. The accuracy of those
    predictions is printed.

    Args:
        X_train: Training features.
        X_test: Test features.
        Y_train: Training labels.
        Y_test: Test labels.

    Returns:
        The predicted labels for ``X_test``.
    """
    classifier = LogisticRegression(C=100, max_iter=500)
    classifier.fit(X_train, Y_train)
    predictions = classifier.predict(X_test)
    # Accuracy is the share of matching labels, so it does not depend on
    # which of the two arrays is passed as the reference.
    score = accuracy_score(y_true=Y_test, y_pred=predictions)
    print("accuracy score on the logistic regression model:", score)
    return predictions
y_pred = Logistic_regression_model(X_train, X_test, Y_train, Y_test)

In [None]:
import matplotlib.pyplot as plt
# Bar chart of how many tweets carry each sent_score label, ordered by label.
sentiment_counts = data['sent_score'].value_counts().sort_index()
sentiment_counts.plot(kind='bar', title='Sentiment Count', color='seagreen')