In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd 
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

from textblob import TextBlob

## Reading and understanding the data

In [None]:
# Load the raw tweets; the CSV is read with the ISO-8859-1 encoding
df = pd.read_csv(
    '../input/demonetization-in-india-twitter-data/demonetization-tweets.csv',
    encoding='ISO-8859-1',
)
df.head()

In [None]:
df.shape

In [None]:
# Check whether there are any missing values in the tweet text
# (counts the null entries of the 'text' column)
df['text'].isnull().sum()

## Cleaning the tweets

In [None]:
# Keep only the tweet text, exposed under the column name 'tweet'
df = pd.DataFrame({'tweet': df['text']})
df.head()

In [None]:
def clean_tweet_text(tweets):
    """Return a cleaned, lower-cased copy of a Series of raw tweet strings.

    Steps, in order (the order matters: URLs, emoji placeholders and mentions
    are removed while their delimiters are still intact):

    1. drop URLs (only the URL token, not the rest of the tweet)
    2. drop encoded emoji placeholders such as ``<ed>`` and ``<U+00A0>``
    3. delete quotes and a few punctuation characters without leaving a gap
    4. lower-case
    5. drop a leading retweet marker (``rt``) and every ``@mention``
    6. turn every remaining non-alphanumeric character into a space
    7. drop the stand-alone word ``amp`` (left over from ``&amp;``)
    8. drop alphabetic words of length 1 or 2
    9. collapse runs of whitespace and strip both ends

    Parameters
    ----------
    tweets : pandas.Series
        Raw tweet texts. Missing values are left untouched.

    Returns
    -------
    pandas.Series
        Cleaned texts, same index as ``tweets``.
    """
    # URLs first: once '/', '.' and ':' are stripped a URL can no longer be recognised,
    # and matching up to end-of-string would throw away the text that follows it.
    cleaned = tweets.replace(r'https?\S+', ' ', regex=True)

    # Encoded emoji placeholders; an explicit hex body avoids the greedy '.*'
    # that swallowed all text between the first '<U' and the last '>'.
    cleaned = cleaned.replace(r'<ed>|<U\+?[0-9A-Fa-f]+>', ' ', regex=True)

    # Removing few characters (deleted, not replaced by a space, so "don't" -> "dont")
    cleaned = cleaned.replace(r'\'|\"|\,|\.|\?|\+|\-|\/|\=|\(|\)|\n', '', regex=True)

    # convert tweets to lowercase
    cleaned = cleaned.str.lower()

    # remove the retweet marker at the beginning and every user mention
    cleaned = cleaned.replace(r'^\s*rt\b', ' ', regex=True)
    cleaned = cleaned.replace(r'@\w+', ' ', regex=True)

    # remove symbols / punctuation (a separate punctuation pass would be dead code after this)
    cleaned = cleaned.replace(r'[^a-z0-9]', ' ', regex=True)

    # remove the word 'amp' only when it stands alone, so words such as "campaign" survive
    cleaned = cleaned.replace(r'\bamp\b', ' ', regex=True)

    # remove words of length 1 or 2
    cleaned = cleaned.replace(r'\b[a-z]{1,2}\b', ' ', regex=True)

    # remove extra spaces in the tweet
    cleaned = cleaned.replace(r'\s+', ' ', regex=True).str.strip()
    return cleaned


df['cleaned_tweet'] = clean_tweet_text(df['tweet'])



Now we have the cleaned_tweet column, but the stop words are still present. We can use cleaned_tweet for creating phrases and ranking the phrases from the tweets.

We will also remove the stop words and name the column as fully_cleaned_tweet. This will be used for clustering the sentiments.

In [None]:
# list of words to remove
words_to_remove = ["ax","i","you","edu","s","t","m","subject","can","lines","re","what", "there","all","we",
                "one","the","a","an","of","or","in","for","by","on","but","is","in","a","not","with","as",
                "was","if","they","are","this","and","it","have","has","from","at","my","be","by","not","that","to",
                "from","com","org","like","likes","so","said","from","what","told","over","more","other",
                "have","last","with","this","that","such","when","been","says","will","also","where","why",
                "would","today", "in", "on", "you", "r", "d", "u", "hw","wat", "oly", "s", "b", "ht", 
                "rt", "p","the","th", "n", "was", "via"]

#remove stopwords and words_to_remove
stop_words = set(stopwords.words('english'))
# One flat set of words. The previous value was a list holding the set and the list
# themselves, so `word not in mystopwords` was always True and nothing was removed.
mystopwords = stop_words.union(words_to_remove)
# NOTE(review): negations such as "not" are in both word lists and are now really removed;
# confirm this is wanted, since the sentiment score is computed on this column.

df['fully_cleaned_tweet'] = df['cleaned_tweet'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in mystopwords)
)

In [None]:
df.head()

In [None]:
# Score each fully cleaned tweet with TextBlob's polarity, which ranges from -1 to +1:
#   -1.0 -> negative polarity
#    0   -> neutral polarity
#    1.0 -> positive polarity
def _polarity(text):
    return TextBlob(text).sentiment.polarity

df['sentiment'] = df['fully_cleaned_tweet'].apply(_polarity)
df.head()

## Vectorize the tweets

In [None]:
# Split every fully cleaned tweet into a list of word tokens
tweet_tokens = df['fully_cleaned_tweet'].apply(word_tokenize)
df['tokenized_tweet'] = tweet_tokens
df.head()

In [None]:
# Drop every token that contains at least one digit
def _without_digit_tokens(tokens):
    return [token for token in tokens if not any(map(str.isdigit, token))]

df['tokenized_tweet'] = df['tokenized_tweet'].apply(_without_digit_tokens)

In [None]:
# Set values for the parameters passed to Word2Vec in the training cell
num_features = 100    # Word vector dimensionality (passed as `size`)
min_word_count = 1   # Minimum word count (passed as `min_count`)
num_workers = 4       # Number of threads to run in parallel (passed as `workers`)
context = 10          # Context window size (passed as `window`)

In [None]:
# Initialize and train the model
# The training corpus is df['tokenized_tweet'], i.e. one list of tokens per tweet.
from gensim.models import word2vec

# NOTE(review): `size=` and `init_sims` look like the gensim 3.x spellings --
# confirm they are still accepted by the installed gensim version.
model = word2vec.Word2Vec(df['tokenized_tweet'], 
                          workers=num_workers, 
                          size=num_features, 
                          min_count = min_word_count, 
                          window = context)

# init_sims will make the model much more memory-efficient.
# (presumably replace=True keeps only the normalized vectors -- TODO confirm)
model.init_sims(replace=True)

### Find vector corresponding to each tweet
Take the average of all word vectors in a tweet. This way we will find the vector for each tweet.

In [None]:
import numpy as np

# Words known to the trained model, as a list (not referenced again in the visible cells)
vocab = list(model.wv.vocab)

def sentence_vector(sentence, model, num_features=None):
    """Return the average of the word vectors of all tokens in ``sentence``.

    Parameters
    ----------
    sentence : iterable of str
        Tokens of one tweet.
    model : object
        Either a trained model exposing its vectors as ``model.wv`` or any
        mapping ``word -> vector`` (used directly when there is no ``wv``).
    num_features : int, optional
        Vector dimensionality. Defaults to ``vectors.vector_size`` when that
        attribute exists, otherwise 100 (the value previously hard-coded).

    Returns
    -------
    numpy.ndarray
        The mean vector. For a sentence without tokens an all-NaN vector is
        returned, so that such rows can still be found with ``isnull()`` and
        dropped later, without triggering a 0/0 runtime warning.

    Raises
    ------
    KeyError
        If a token has no vector in the model.
    """
    # Look vectors up on `model.wv` when present instead of indexing the model object itself.
    vectors = getattr(model, "wv", model)
    if num_features is None:
        num_features = getattr(vectors, "vector_size", 100)

    featureV = np.zeros(num_features, dtype="float32")
    nwords = 0
    for word in sentence:
        featureV = np.add(featureV, vectors[word])
        nwords += 1

    if nwords == 0:
        return np.full(num_features, np.nan, dtype="float32")
    return np.divide(featureV, nwords)

# One averaged word vector per tweet ...
per_tweet_vectors = df['tokenized_tweet'].apply(sentence_vector, model=model)

# ... expanded so that every vector component becomes its own column
tweet_vector = per_tweet_vectors.apply(pd.Series)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every vector component with min-max scaling and wrap the result in a DataFrame again
scaler = MinMaxScaler()
tweet_vector = pd.DataFrame(scaler.fit_transform(tweet_vector))
tweet_vector

### Add sentiment to the tweet vector

In [None]:
#Scale the 'sentiment' vector, as the sentiment varies from -1 to +1
def sentiment(x, threshold=0.04):
    """Map a polarity score in [-1, 1] to a sentiment class.

    Parameters
    ----------
    x : float
        Polarity score.
    threshold : float, optional
        Half-width of the neutral band around 0 (default 0.04).

    Returns
    -------
    0 for negative (``x < -threshold``), 1 for positive (``x > threshold``),
    0.5 for neutral (everything in between, including NaN).
    """
    # The lower bound must be -threshold: comparing against +threshold on both
    # sides classed every neutral score (polarity 0) as negative.
    if x < -threshold:
        return 0
    if x > threshold:
        return 1
    return 0.5

# Store the sentiment class of every tweet as an extra column labelled 100
sentiment_class = df['sentiment'].apply(sentiment)
tweet_vector[100] = sentiment_class

tweet_vector

In [None]:
#Updating the 'sentiment' column in df also
# (the raw polarity score is overwritten with the value stored in tweet_vector[100])
df['sentiment'] = tweet_vector[100]

## Cluster the narratives [= opinions + expressions]

In [None]:
# Index labels of the rows that contain at least one missing value
has_missing_value = tweet_vector.isnull().any(axis=1)
missing_row_indices = tweet_vector[has_missing_value].index.to_list()
print(missing_row_indices)

In [None]:
# Drop every row of tweet_vector that contains at least one missing value
tweet_vector = tweet_vector.dropna()

In [None]:
# Drop from df the rows whose index labels are listed in missing_row_indices,
# i.e. the same rows that were dropped from tweet_vector
df = df.drop(missing_row_indices)

In [None]:
# K-Means clustering of the tweet vectors

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples

range_n_clusters = [3, 4, 5, 6, 7, 8, 9, 10]
X = tweet_vector

# Start from the smallest candidate and the lowest possible silhouette value (-1),
# so that a valid cluster count is selected even if no score is positive.
n_best_clusters = range_n_clusters[0]
silhouette_best = -1.0

for n_clusters in range_n_clusters:

    # Initialize the clusterer with n_clusters value and a fixed random seed
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(X)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed clusters
    silhouette_avg = silhouette_score(X, cluster_labels)

    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

    # Strict comparison: on ties the smaller cluster count is kept
    if silhouette_avg > silhouette_best:
        silhouette_best = silhouette_avg
        n_best_clusters = n_clusters

In [None]:
# Best number of cluster
n_best_clusters

In [None]:
# Clustering with n_best_clusters
# Refits K-Means on X with the cluster count selected by the silhouette search,
# using the same random_state as the search loop.
clusterer = KMeans(n_clusters= n_best_clusters , random_state=10)
cluster_labels = clusterer.fit_predict(X)

In [None]:
# Distinct cluster labels produced by the final K-Means fit
clusters = np.unique(cluster_labels)  
print(clusters)

In [None]:
# Tweets (raw, cleaned, fully cleaned) with their sentiment and cluster number,
# ordered by cluster number
finaldf = df[['fully_cleaned_tweet', 'cleaned_tweet', 'tweet', 'sentiment']].copy()
finaldf.insert(0, 'cl_num', cluster_labels)
finaldf = finaldf.sort_values(by='cl_num')
finaldf.head()

In [None]:
# Adding cluster numbers to df as well
# (cluster_labels is assigned positionally, one label per remaining row of df)
df['cl_num'] = cluster_labels
df.head()

In [None]:
dfOrdered = pd.DataFrame(df)

# Columns that together identify one distinct tweet
tweet_identity = ['tweet', 'cleaned_tweet', 'fully_cleaned_tweet', 'sentiment', 'tokenized_tweet', 'cl_num']

# Token lists are turned into tuples so they can be used as a grouping key
dfOrdered['tokenized_tweet'] = dfOrdered['tokenized_tweet'].apply(tuple)

# 'freq' = how many identical rows exist, i.e. how many times a tweet has been 'retweeted'
dfUnique = (
    dfOrdered
    .groupby(tweet_identity)
    .size()
    .reset_index(name="freq")
    .sort_values(by=['cl_num'])
)

In [None]:
# Convert the tokenized tweets from tuples (needed as grouping keys) back to lists
dfUnique['tokenized_tweet'] = dfUnique['tokenized_tweet'].apply(list)
dfOrdered['tokenized_tweet'] = dfOrdered['tokenized_tweet'].apply(list)

In [None]:
# We can see that there are around 5000 unique tweets
dfUnique.shape

# Calculate abstraction and expression for each narrative


* Abstraction: What the opinion is about; e.g., an opinion on demonetisation can be about a topic such as 'Digital India', corruption, PM Modi, etc.
* Expression: The 'sentiment' of the opinion, i.e. positive, negative or neutral.

Each cluster represents a narrative. In other words, people start supporting other people having similar opinions, and as a result, opinions turn into narratives.

In [None]:
# Write one text file per cluster; every tweet of the cluster is followed by '. '
for cluster_id in clusters:
    cluster_tweets = dfUnique['fully_cleaned_tweet'][dfUnique.cl_num == cluster_id]
    with open('./tweets_Cluster_'+str(cluster_id)+'.txt','w') as out:
        out.write(''.join(tweet + '. ' for tweet in cluster_tweets))

In [None]:
#A combination of (Noun, adjective, cardinal number, foreign word and Verb) are being extracted now
#Extract chunks matching pattern. Patterns are:
#1) Noun phrase (2 or more nouns occurring together. Ex United states of America, Abdul Kalam etc)
#2) Number followed by Noun (Ex: 28 Terrorists, 45th President)
#3) Adjective followed by Noun (Ex: Economic impact, beautiful inauguration)
#4) Foreign word (Ex: Jallikattu, Narendra modi)
#5) Noun followed by Verb (Ex: Terrorists arrested)
#And a combination of all 5
        
import re
import nltk

phrases = pd.DataFrame({'extracted_phrases': [], 'cluster_num': []})


# Building blocks for matching a "TAG/word " sequence. Raw strings are used so that
# \w and \s reach the regex engine without being treated as (invalid) string escapes.
A = r'(CD|JJ)/\w+\s'  #cd or jj
B = r'(NN|NNS|NNP|NNPS)/\w+\s'  #nouns
C = r'(VB|VBD|VBG|VBN|VBP|VBZ)/\w+\s' #verbs
D = r'FW/\w+\s'  #foreign word
# Each pattern is one or more repetitions of a tag sequence built from the blocks above
patterns = ['('+A+B+')+', '('+D+B+')+','('+D+')+', '('+B+')+', '('+D+A+B+')+', 
           '('+B+C+')+', '('+D+B+C+')+', '('+B+A+B+')+', '('+B+B+C+')+'] 


def extract_phrases(tag1, tag2, sentences):
    """Collect two-word phrases whose POS tags are ``tag1`` followed by ``tag2``.

    Each sentence is tokenized and POS-tagged with nltk; whenever a word tagged
    ``tag1`` is immediately followed by a word tagged ``tag2``, the string
    "first second" is collected.

    Parameters
    ----------
    tag1 : str
        POS tag of the first word.
    tag2 : str
        POS tag of the word that must directly follow it.
    sentences : iterable of str
        Sentences to scan.

    Returns
    -------
    The collected phrases: the initial empty list when nothing matched,
    otherwise the value built up by ``np.append``.
    """
    extract_phrase = []
    for sentence in sentences:
        phrase = []
        # 1 while the previous word was tagged tag1, i.e. the current word may complete a phrase
        next_word = 0
        for word, pos in nltk.pos_tag(nltk.word_tokenize(sentence)):
            if next_word == 1:
                next_word = 0
                if pos == tag2:
                    extract_phrase = np.append(extract_phrase,phrase + ' ' + word) 
            
            # checked after the block above, so the same word can also start a new phrase
            if pos == tag1:
                next_word = 1
                phrase = word
    return extract_phrase

# Phrase frames are collected in a list and concatenated once after the loop
cluster_phrase_frames = []

for i in clusters:
    # `with` closes the file again; the handle was previously left open
    with open('./tweets_Cluster_'+str(i)+'.txt', 'r') as cluster_file:
        lines = cluster_file.read() #read all lines
    sentences = nltk.sent_tokenize(lines) #tokenize sentences

    for sentence in sentences:
        # "TAG/word TAG/word ..." representation of the sentence.
        # Kept in its own name so the clustering matrix `X` is not overwritten.
        tagged_sentence = " ".join(
            pos + '/' + word for word, pos in nltk.pos_tag(nltk.word_tokenize(sentence))
        )

        phrase = []
        for pattern in patterns:
            match = re.search(pattern, tagged_sentence)
            if match:
                # strip the "TAG/" prefix from every matched token
                phrase.append(' '.join(token.split('/')[1] for token in match.group(0).split()))

        # sentences without any match contribute no rows
        if phrase:
            cluster_phrase_frames.append(
                pd.DataFrame({'extracted_phrases': np.unique(phrase), 'cluster_num': int(i)})
            )

phrases = pd.concat([phrases] + cluster_phrase_frames, ignore_index = True)

print(phrases)

### Keeping the largest phrase

In [None]:
# Within every cluster keep only the phrases that are not contained in another phrase.
# Ex: of "lakh looted", "40 lakh looted" and "Rs 40 lakh looted" only the largest one,
# "Rs 40 lakh looted", is kept -- one large phrase instead of 3 different phrases.

phrases_final = pd.DataFrame({'extracted_phrases': [], 'cluster_num': []})
for i in clusters:
    candidates = np.unique(np.array(phrases['extracted_phrases'][phrases.cluster_num == i]))

    phrases_for_each_cluster = []
    for position, phrase in enumerate(candidates):
        other_phrases = np.delete(candidates, position)
        contained_elsewhere = any(phrase in other for other in other_phrases)
        # considering phrases of length greater than 1 only
        if not contained_elsewhere and len(phrase.split(' ')) > 1:
            phrases_for_each_cluster.append(phrase)

    k = pd.DataFrame({'extracted_phrases': phrases_for_each_cluster, 'cluster_num': int(i) })
    phrases_final = pd.concat([phrases_final, k], ignore_index = True)

In [None]:
phrases_final

# Calculate TF-IDF score 

## For each phrase in each cluster, calculate term frequency

In [None]:
dfUnique.head()

In [None]:
#Term-frequency : For each cluster, calculate the number of tweets of that cluster a given phrase occurs in

phrases_final['term_freq'] = 0

for i in clusters:
    # tweets of the cluster are looked up once per cluster, not once per phrase
    tweets = dfUnique['fully_cleaned_tweet'][dfUnique.cl_num == i]
    in_cluster = phrases_final.cluster_num == i
    for phrase in phrases_final.loc[in_cluster, 'extracted_phrases']:
        row_mask = in_cluster & (phrases_final.extracted_phrases == phrase)
        # A single .loc assignment writes into phrases_final itself; the chained
        # `frame[col][mask] = ...` form may write to a temporary copy instead.
        phrases_final.loc[row_mask, 'term_freq'] = sum(phrase in tweet for tweet in tweets)

### For each phrase in each cluster, calculate document frequency

In [None]:
#Document-frequency
phrases_final['doc_freq'] = 0

# Text of every cluster (its tweets joined by '. '), built once instead of once per phrase
cluster_texts = {
    i: ''.join(tweet + '. ' for tweet in dfUnique['fully_cleaned_tweet'][dfUnique.cl_num == i])
    for i in clusters
}

# for each phrase, compute the number of clusters that phrase occurs in
for phrase in phrases_final['extracted_phrases'].unique():
    clusters_with_phrase = sum(phrase in text for text in cluster_texts.values())
    # The count is written to every row of the phrase. The previous mask also required
    # `cluster_num == i`, so a row was only ever counted against its own cluster.
    # .loc is used so the assignment is applied to phrases_final itself.
    phrases_final.loc[phrases_final.extracted_phrases == phrase, 'doc_freq'] = clusters_with_phrase
        

In [None]:
# Calculate IDF
import math

def _idf(doc_freq):
    """log10(number of clusters / document frequency); 0.0 when the phrase occurs in no cluster."""
    # a document frequency of 0 would otherwise raise ZeroDivisionError
    if doc_freq <= 0:
        return 0.0
    return math.log10(n_best_clusters / doc_freq)

# NOTE: the 'doc_freq' column is overwritten with the IDF value, so this cell must run only once
phrases_final['doc_freq'] = phrases_final['doc_freq'].apply(_idf)

### For each phrase in each cluster, calculate tf-idf

In [None]:
# Calculate TF-IDF
# TF X IDF
# ('doc_freq' holds the IDF value at this point: the IDF cell overwrote the column)
phrases_final['tf-idf'] = phrases_final['term_freq']*phrases_final['doc_freq']

In [None]:
phrases_final

## For each cluster find top few phrases and respective sentiment

In [None]:
phrases_final['diff_tf-idf'] = 0.0

# arrange in descending order of tf-idf score within every cluster (done once, before the loop)
phrases_final = phrases_final.sort_values(['cluster_num','tf-idf'], ascending=[True, False])
phrases_final = phrases_final.reset_index(drop = True) #to avoid old index being added as a new column

narrative_frames = [pd.DataFrame({'cl_num': [], 'abstraction': []})]
for i in clusters:
    in_cluster = phrases_final.cluster_num == i
    if not in_cluster.any():
        continue

    #Break this distribution at a point where the difference between any consecutive phrases is maximum
    #difference between each tf-idf value and the NEXT one
    scores = phrases_final.loc[in_cluster, 'tf-idf']
    #The last value for each cluster will be 'NaN'. Replacing it with '0'.
    gaps = (scores - scores.shift(-1)).abs().fillna(0)
    # .loc writes into phrases_final itself (chained indexing may write to a copy)
    phrases_final.loc[in_cluster, 'diff_tf-idf'] = gaps

    #index of the last phrase above the largest gap.
    #(Differencing against the previous value instead would point at the first
    # phrase below the gap and pull that phrase into the abstraction.)
    ind = gaps.idxmax()

    abstract = phrases_final.loc[in_cluster & (phrases_final.index <= ind), 'extracted_phrases']

    #store the abstraction corresponding to each cluster
    narrative_frames.append(pd.DataFrame({'cl_num': int(i), 'abstraction': abstract}))

narrative = pd.concat(narrative_frames, ignore_index = True)

In [None]:
narrative

In [None]:
#Assigning polarity based on the sentiment for each tweet 2=negative, 1=positive, 3=neutral
# A single mapping replaces the three chained assignments (which may write to a copy)
# and the `np.NaN` alias; sentiment values outside the mapping become NaN.
POLARITY_BY_SENTIMENT = {0.5: "3", 1: "1", 0: "2"}
dfUnique['polarity'] = dfUnique['sentiment'].map(POLARITY_BY_SENTIMENT)

## Assign the sentiment to each extracted phrases
Count the number of tweets in which a phrase occurs in a positive, negative and neutral context, then assign the most frequent sentiment to the phrase.

In [None]:
from collections import Counter

#find the highest occurring sentiment corresponding to each tweet
def find_mode(a):
    """Return the dominant polarity label among ``a``.

    Labels are '1' (positive), '2' (negative) and '3' (neutral).

    * a single most frequent label is returned as is;
    * a tie between neutral and one other label returns the other label
      (3&2 -> '2', 3&1 -> '1');
    * any other tie (2&1, or all three) and empty input return the integer 3,
      which callers treat as neutral.

    Parameters
    ----------
    a : iterable
        Polarity labels of the tweets a phrase occurs in.
    """
    counts = Counter(a).most_common(3)
    if not counts:
        return 3

    # most_common is ordered by decreasing count, so the first entry holds the maximum
    top_count = counts[0][1]
    mode = sorted(label for label, count in counts if count == top_count)

    if len(mode) == 1:
        return mode[0]
    if len(mode) == 2 and mode[1] == '3':
        return mode[0]
    return 3
    
#1=>+ve 2=>-ve 3=>Neutral
EXPRESSION_BY_POLARITY = {'2': "Negative", '1': "Positive"}  # anything else -> "Neutral"

# -1 marks abstractions that occur in no tweet. The column is created with object
# dtype because string labels are written into it below.
narrative['expression'] = -1
narrative['expression'] = narrative['expression'].astype(object)

dfUnique = dfUnique.reset_index(drop = True)
for i in clusters:
    in_cluster = dfUnique.cl_num == i
    cluster_tweets = dfUnique.loc[in_cluster, 'fully_cleaned_tweet']
    cluster_polarity = dfUnique.loc[in_cluster, 'polarity']

    for abst in narrative.loc[narrative.cl_num == i, 'abstraction']:
        # polarity of every tweet of the cluster that contains the abstraction
        sent = [polarity for tweet, polarity in zip(cluster_tweets, cluster_polarity) if abst in tweet]

        if len(sent) != 0:
            ## if mode is 3&2-2, 3&1-1, 2&1-3, 3&2&1 - 3
            senti = find_mode(sent)
            sent_value = EXPRESSION_BY_POLARITY.get(senti, "Neutral")
            # .loc writes into narrative itself (chained indexing may write to a copy)
            narrative.loc[(narrative.abstraction == abst) & (narrative.cl_num == i), 'expression'] = sent_value
        

In [None]:
narrative

In [None]:
# Plotting the sentiments
import seaborn as sns
# The Series is passed by keyword so it is explicitly the x variable
sns.countplot(x=narrative['expression'])

In [None]:
!pip install openpyxl

In [None]:
from pandas import ExcelWriter

#Save the narratives in an excel file 
# One sheet per cluster: tweets ordered by decreasing frequency, with the cluster's
# abstractions/expressions in the first rows and '-' in the remaining ones.
# The context manager saves and closes the workbook at the end of the block.
with pd.ExcelWriter('narrative.xlsx') as writer:
    for i in clusters:
        df1 = (
            dfUnique.loc[dfUnique.cl_num == i, ['tweet', 'freq']]
            .sort_values('freq', ascending=False)
            .reset_index(drop=True)
        )

        df2 = narrative.loc[narrative.cl_num == i, ['abstraction', 'expression']]
        # pad the narrative rows with '-' up to the number of tweets
        n_padding = max(len(df1) - len(df2), 0)
        df3 = pd.DataFrame({'abstraction': n_padding*['-'], 'expression': n_padding*['-']})
        df2 = pd.concat([df2, df3], ignore_index=True)

        # both frames have a 0..n-1 index here, so the columns line up row by row
        df1['abstraction'] = df2['abstraction']
        df1['expression'] = df2['expression']

        df1.to_excel(writer, sheet_name='narrative_cluster'+str(i))
    