In [1]:
keyword = "demonetization" 
file_name = "demonetization-tweets"

In [2]:
import numpy as np
import pandas as pd 
import os 
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
import chardet
from textblob import TextBlob
from gensim.models import word2vec
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
pd.set_option('mode.chained_assignment', None)

# Step 1: Import data

In [3]:
# Identify the encoding of the data file
with open('./demonetization-tweets.csv','rb') as f:
    result = chardet.detect(f.read())  #Windows-1252

# Import the data file
df = pd.read_csv("./"+file_name+".csv", encoding=result['encoding'])
#df = df[1:1000+1]
df = df['text']
df = pd.DataFrame({'tweet':df})    

# Step 2: Clean the tweets


In [4]:
# Clean the tweets    
df['cleaned_tweet'] = df['tweet'].replace(r'\'|\"|\,|\.|\?|\+|\-|\/|\=|\(|\)|\n|"', '', regex=True)
df['cleaned_tweet'] = df['cleaned_tweet'].replace("  ", " ")

words_remove = ["ax","i","you","edu","s","t","m","subject","can","lines","re","what", "there","all","we",
                "one","the","a","an","of","or","in","for","by","on","but","is","in","a","not","with","as",
                "was","if","they","are","this","and","it","have","has","from","at","my","be","by","not","that","to",
                "from","com","org","like","likes","so","said","from","what","told","over","more","other",
                "have","last","with","this","that","such","when","been","says","will","also","where","why",
                "would","today", "in", "on", "you", "r", "d", "u", "hw","wat", "oly", "s", "b", "ht", 
                "rt", "p","the","th", "n", "was"]


def cleantext(df, words_to_remove = words_remove): 
    ### dont change the original tweet
    # remove emoticons form the tweets
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'<ed>','', regex = True)
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'\B<U+.*>|<U+.*>\B|<U+.*>','', regex = True)
    
    # convert tweets to lowercase
    df['cleaned_tweet'] = df['cleaned_tweet'].str.lower()
    
    #remove user mentions
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'^(@\w+)',"", regex=True)
    
    #remove 'rt' in the beginning
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'^(rt @)',"", regex=True)
    
    #remove_symbols
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'[^a-zA-Z0-9]', " ", regex=True)

    #remove punctuations 
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'[[]!"#$%\'()\*+,-./:;<=>?^_`{|}]+',"", regex = True)

    #remove_URL(x):
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'https.*$', "", regex = True)

    #remove 'amp' in the text
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'amp',"", regex = True)
    
    #remove words of length 1 or 2 
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'\b[a-zA-Z]{1,2}\b','', regex=True)

    #remove extra spaces in the tweet
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'^\s+|\s+$'," ", regex=True)
     
    
    #remove stopwords and words_to_remove
    stop_words = set(stopwords.words('english'))
    mystopwords = [stop_words, "via", words_to_remove]
    
    df['fully_cleaned_tweet'] = df['cleaned_tweet'].apply(lambda x: ' '.join([word for word in x.split() if word not in mystopwords]))
    

    return df

#get the processed tweets
df = cleantext(df)

  re.compile(obj)


In [5]:
#Sentiment Analysis

df['sentiment'] = df['fully_cleaned_tweet'].apply(lambda x: TextBlob(x).sentiment.polarity)  #-1 to 1

In [6]:
df.head()

Unnamed: 0,tweet,cleaned_tweet,fully_cleaned_tweet,sentiment
0,RT @rssurjewala: Critical question: Was PayTM ...,rssurjewala critical question was paytm info...,rssurjewala critical question was paytm inform...,0.15
1,RT @Hemant_80: Did you vote on #Demonetization...,hemant 80 did you vote demonetization modi...,hemant 80 did you vote demonetization modi sur...,0.0
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",roshankar former finsec rbi governor cbdt ch...,roshankar former finsec rbi governor cbdt chai...,0.0
3,RT @ANI_news: Gurugram (Haryana): Post office ...,ani news gurugram haryana post office employ...,ani news gurugram haryana post office employee...,0.0
4,RT @satishacharya: Reddy Wedding! @mail_today ...,satishacharya reddy wedding mail today cart...,satishacharya reddy wedding mail today cartoon...,0.0


# Step 3: Vectorize the tweets

In [7]:
df['tokenized_tweet'] = df['fully_cleaned_tweet'].apply(word_tokenize)

In [8]:
df

Unnamed: 0,tweet,cleaned_tweet,fully_cleaned_tweet,sentiment,tokenized_tweet
0,RT @rssurjewala: Critical question: Was PayTM ...,rssurjewala critical question was paytm info...,rssurjewala critical question was paytm inform...,0.150000,"[rssurjewala, critical, question, was, paytm, ..."
1,RT @Hemant_80: Did you vote on #Demonetization...,hemant 80 did you vote demonetization modi...,hemant 80 did you vote demonetization modi sur...,0.000000,"[hemant, 80, did, you, vote, demonetization, m..."
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",roshankar former finsec rbi governor cbdt ch...,roshankar former finsec rbi governor cbdt chai...,0.000000,"[roshankar, former, finsec, rbi, governor, cbd..."
3,RT @ANI_news: Gurugram (Haryana): Post office ...,ani news gurugram haryana post office employ...,ani news gurugram haryana post office employee...,0.000000,"[ani, news, gurugram, haryana, post, office, e..."
4,RT @satishacharya: Reddy Wedding! @mail_today ...,satishacharya reddy wedding mail today cart...,satishacharya reddy wedding mail today cartoon...,0.000000,"[satishacharya, reddy, wedding, mail, today, c..."
...,...,...,...,...,...
14935,RT @saxenavishakha: Ghost of demonetization re...,saxenavishakha ghost demonetization returns ...,saxenavishakha ghost demonetization returns wi...,0.000000,"[saxenavishakha, ghost, demonetization, return..."
14936,N d modi fans-d true nationalists of the count...,modi fansd true nationalists the country sti...,modi fansd true nationalists the country stil ...,-0.105556,"[modi, fansd, true, nationalists, the, country..."
14937,RT @bharat_builder: Lol. Demonetization has fi...,bharat builder lol demonetization has fixed ...,bharat builder lol demonetization has fixed lo...,0.183333,"[bharat, builder, lol, demonetization, has, fi..."
14938,RT @Stupidosaur: @Vidyut B team of BJP. CIA ba...,stupidosaur vidyut team bjp cia baby cctv ...,stupidosaur vidyut team bjp cia baby cctv evm ...,0.000000,"[stupidosaur, vidyut, team, bjp, cia, baby, cc..."


In [9]:
#if a word has a digit, remove that word
df['tokenized_tweet'] = df['tokenized_tweet'].apply(lambda x: [y for y in x if not any(c.isdigit() for c in y)])

In [10]:
# Set values for various parameters
num_features = 100    # Word vector dimensionality                      
min_word_count = 1   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    

In [11]:
# Initialize and train the model (this will take some time)
print("Training model...")
model = word2vec.Word2Vec(df['tokenized_tweet'], workers=num_workers, \
            vector_size=num_features, min_count = min_word_count, \
            window = context)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

Training model...


### Find vector corresponding to each tweet
Take the average of all word vectors in a tweet

In [12]:
vocab = list(model.wv.key_to_index)

In [13]:
vocab

['demonetization',
 'the',
 'india',
 'and',
 'modi',
 'that',
 'out',
 'for',
 'who',
 'had',
 'narendra',
 'rich',
 'after',
 'find',
 'dear',
 'implement',
 'evanspiegel',
 'actually',
 'from',
 'have',
 'urautelaforever',
 'was',
 'people',
 'narendramodi',
 'bank',
 'will',
 'has',
 'this',
 'how',
 'are',
 'with',
 'you',
 'about',
 'cash',
 'impact',
 'lakh',
 'support',
 'its',
 'such',
 'terrorists',
 'all',
 'over',
 'against',
 'not',
 'nation',
 'since',
 'thats',
 'they',
 'move',
 'third',
 'incident',
 'looted',
 'modibharosa',
 'kishtwar',
 'gauravcsawant',
 'youtube',
 'why',
 'money',
 'question',
 'back',
 'but',
 'across',
 'atms',
 'like',
 'now',
 'more',
 'his',
 'due',
 'says',
 'supports',
 'goes',
 'paytm',
 'our',
 'drkumarvishwas',
 'oscar',
 'should',
 'app',
 'govt',
 'whether',
 'good',
 'still',
 'modis',
 'bjp',
 'can',
 'notes',
 'full',
 'when',
 'shortage',
 'poor',
 'party',
 'clearly',
 'your',
 'what',
 'critical',
 'rssurjewala',
 'bypolls',
 'in

In [14]:
vocab = list(model.wv.key_to_index)
def sentence_vector(sentence, model):
    nwords = 0
    featureV = np.zeros(100, dtype="float32")
    for word in sentence:
        if word not in vocab:
            continue
        featureV = np.add(featureV, model.wv[word])
        nwords = nwords + 1
    if nwords > 0: 
        featureV = np.divide(featureV, nwords)
    return featureV

tweet_vector = df['tokenized_tweet'].apply(lambda x: sentence_vector(x, model))  

tweet_vector = tweet_vector.apply(pd.Series)

In [15]:
#Tweet vector should vary from 0 to 1 (normalize the vector)
for x in range(len(tweet_vector)):
    x_min = tweet_vector.iloc[x].min()
    x_max = tweet_vector.iloc[x].max()
    X  = tweet_vector.iloc[x]
    i = 0
    if (x_max - x_min) == 0:
        for y in X:
            tweet_vector.iloc[x][i] = (1/len(tweet_vector.iloc[x]))
            i = i + 1
    else:
        for y in X:
            tweet_vector.iloc[x][i] = ((y - x_min)/(x_max - x_min))
            i = i + 1

In [16]:
print(tweet_vector)

             0         1         2         3         4         5         6    
0     -0.119772  0.012541  0.022908 -0.012207  0.093855 -0.107808  0.065990  \
1     -0.151624  0.067410  0.001232  0.033011  0.012311 -0.053566  0.063053   
2     -0.109865  0.079275 -0.036803  0.069708  0.015164 -0.128124  0.172709   
3     -0.091764  0.031438 -0.030831  0.067390  0.020482 -0.175661  0.157000   
4     -0.070429  0.102370  0.042013  0.006973  0.018645  0.000000  0.194049   
...         ...       ...       ...       ...       ...       ...       ...   
14935 -0.071265  0.062739 -0.027764  0.063051  0.047983 -0.116335  0.138535   
14936 -0.108562  0.069429 -0.043389  0.065047  0.022246 -0.092271  0.109841   
14937 -0.093982  0.031785 -0.115378  0.073162 -0.010726 -0.047210  0.154450   
14938 -0.069449  0.029856 -0.053555  0.064788  0.055495 -0.078229  0.143374   
14939 -0.065060  0.028290 -0.058571  0.071047  0.054906 -0.065682  0.145076   

             7         8         9   ...        90 

# Step 4: Add sentiment to the tweet vector

In [17]:
#Scale the 'sentiment' vector
#Sentiment varies from -1 to +1

def sentiment(x):
    if x < 0.04:
        return 0
    elif x > 0.04:
        return 1
    else:
        return 0.5

tweet_vector[100] = df['sentiment'].apply(lambda x: sentiment(x))

In [18]:
tweet_vector  #sentiment 0 to +1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,-0.119772,0.012541,0.022908,-0.012207,0.093855,-0.107808,0.065990,0.193632,0.124113,0.073122,...,0.128465,0.047848,-0.132761,1.000000,0.103569,0.035771,0.000000,-0.145036,0.065566,1
1,-0.151624,0.067410,0.001232,0.033011,0.012311,-0.053566,0.063053,0.145116,0.120084,0.106455,...,0.033400,0.034057,-0.122641,1.000000,0.078306,-0.007952,-0.163343,-0.068051,0.080755,0
2,-0.109865,0.079275,-0.036803,0.069708,0.015164,-0.128124,0.172709,0.149966,0.103798,0.108282,...,0.050000,-0.007982,-0.064162,0.191141,0.221205,-0.044450,-0.139647,-0.128860,0.063375,0
3,-0.091764,0.031438,-0.030831,0.067390,0.020482,-0.175661,0.157000,0.155331,0.066747,0.067203,...,0.038812,-0.020445,-0.052864,0.147307,0.228458,-0.040328,-0.134557,-0.146528,0.043255,0
4,-0.070429,0.102370,0.042013,0.006973,0.018645,0.000000,0.194049,0.108037,0.021510,0.047469,...,0.039842,0.063686,-0.000034,0.165006,0.174933,0.021646,-0.140149,-0.032286,0.092191,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14935,-0.071265,0.062739,-0.027764,0.063051,0.047983,-0.116335,0.138535,0.078707,0.035290,0.075781,...,0.030878,-0.063327,-0.006459,0.071347,0.157959,-0.041671,-0.092269,-0.112490,0.016546,0
14936,-0.108562,0.069429,-0.043389,0.065047,0.022246,-0.092271,0.109841,0.141071,0.107169,0.090184,...,0.044461,0.029142,-0.105415,0.157596,0.155227,-0.028157,-0.108655,-0.143632,0.053702,0
14937,-0.093982,0.031785,-0.115378,0.073162,-0.010726,-0.047210,0.154450,0.049487,0.102480,0.076090,...,0.037369,-0.051818,-0.079735,0.068371,0.179009,-0.081439,-0.060286,-0.143351,0.050386,1
14938,-0.069449,0.029856,-0.053555,0.064788,0.055495,-0.078229,0.143374,0.137145,0.123738,0.092735,...,0.038365,-0.025350,-0.103527,0.146800,0.145423,-0.039902,-0.070741,-0.142590,0.067799,0


In [19]:
#Updating the 'sentiment' column in df also
df['sentiment'] = tweet_vector[100]

# Step 5: Cluster the narratives [= opinions + expressions]

In [20]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples

range_n_clusters = [4, 5, 6, 7, 8, 9, 10, 11]
X = tweet_vector
n_best_clusters = 0
silhouette_best = 0
for n_clusters in range_n_clusters:
    
    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility.
    clusterer = KMeans(n_clusters=n_clusters, random_state=10, n_init=10)
    cluster_labels = clusterer.fit_predict(X)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(X, cluster_labels)
                                      #, sample_size = 5000)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)
    
    if silhouette_avg > silhouette_best:
        silhouette_best = silhouette_avg
        n_best_clusters = n_clusters

For n_clusters = 4 The average silhouette_score is : 0.280086391610312
For n_clusters = 5 The average silhouette_score is : 0.3193228444215377
For n_clusters = 6 The average silhouette_score is : 0.3438172617058945
For n_clusters = 7 The average silhouette_score is : 0.3755095998110072
For n_clusters = 8 The average silhouette_score is : 0.3976750583156838
For n_clusters = 9 The average silhouette_score is : 0.41230399283149943
For n_clusters = 10 The average silhouette_score is : 0.42697608236465606
For n_clusters = 11 The average silhouette_score is : 0.44016286295969326


In [21]:
n_best_clusters

11

In [22]:
clusterer = KMeans(n_clusters= n_best_clusters , random_state=10, n_init=10)
cluster_labels = clusterer.fit_predict(X)

In [23]:
np.unique(cluster_labels)  

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10], dtype=int32)

In [24]:
#Array of tweets, the corresponding cluster number, sentiment
finaldf = pd.DataFrame({'cl_num': cluster_labels,'fully_cleaned_tweet': df['fully_cleaned_tweet'], 'cleaned_tweet': df['cleaned_tweet'], 'tweet': df['tweet'],'sentiment': df['sentiment']})
finaldf = finaldf.sort_values(by=['cl_num'])

In [25]:
df['cl_num'] = cluster_labels

In [26]:
df

Unnamed: 0,tweet,cleaned_tweet,fully_cleaned_tweet,sentiment,tokenized_tweet,cl_num
0,RT @rssurjewala: Critical question: Was PayTM ...,rssurjewala critical question was paytm info...,rssurjewala critical question was paytm inform...,1,"[rssurjewala, critical, question, was, paytm, ...",1
1,RT @Hemant_80: Did you vote on #Demonetization...,hemant 80 did you vote demonetization modi...,hemant 80 did you vote demonetization modi sur...,0,"[hemant, did, you, vote, demonetization, modi,...",5
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",roshankar former finsec rbi governor cbdt ch...,roshankar former finsec rbi governor cbdt chai...,0,"[roshankar, former, finsec, rbi, governor, cbd...",3
3,RT @ANI_news: Gurugram (Haryana): Post office ...,ani news gurugram haryana post office employ...,ani news gurugram haryana post office employee...,0,"[ani, news, gurugram, haryana, post, office, e...",0
4,RT @satishacharya: Reddy Wedding! @mail_today ...,satishacharya reddy wedding mail today cart...,satishacharya reddy wedding mail today cartoon...,0,"[satishacharya, reddy, wedding, mail, today, c...",0
...,...,...,...,...,...,...
14935,RT @saxenavishakha: Ghost of demonetization re...,saxenavishakha ghost demonetization returns ...,saxenavishakha ghost demonetization returns wi...,0,"[saxenavishakha, ghost, demonetization, return...",3
14936,N d modi fans-d true nationalists of the count...,modi fansd true nationalists the country sti...,modi fansd true nationalists the country stil ...,0,"[modi, fansd, true, nationalists, the, country...",9
14937,RT @bharat_builder: Lol. Demonetization has fi...,bharat builder lol demonetization has fixed ...,bharat builder lol demonetization has fixed lo...,1,"[bharat, builder, lol, demonetization, has, fi...",6
14938,RT @Stupidosaur: @Vidyut B team of BJP. CIA ba...,stupidosaur vidyut team bjp cia baby cctv ...,stupidosaur vidyut team bjp cia baby cctv evm ...,0,"[stupidosaur, vidyut, team, bjp, cia, baby, cc...",0


In [27]:
dfOrdered = pd.DataFrame(df)

#Compute how many times a tweet has been 'retweeted' - that is, how many rows in dfOrdered are identical
dfOrdered['tokenized_tweet'] = dfOrdered['tokenized_tweet'].apply(tuple)
dfUnique = dfOrdered.groupby(['tweet', 'cleaned_tweet', 'fully_cleaned_tweet', 'sentiment','tokenized_tweet', 'cl_num']).size().reset_index(name="freq")
dfUnique = dfUnique.sort_values(by=['cl_num'])

In [28]:
dfUnique['tokenized_tweet'] = dfUnique['tokenized_tweet'].apply(list)
dfOrdered['tokenized_tweet'] = dfOrdered['tokenized_tweet'].apply(list)

In [29]:
dfUnique

Unnamed: 0,tweet,cleaned_tweet,fully_cleaned_tweet,sentiment,tokenized_tweet,cl_num,freq
2573,India's experiencing a mobile POS boom followi...,indias experiencing mobile pos boom following...,indias experiencing mobile pos boom following ...,0,"[indias, experiencing, mobile, pos, boom, foll...",0,1
2568,India's attempt to go cashless is turning food...,indias attempt cashless turning food vouche...,indias attempt cashless turning food vouchers ...,0,"[indias, attempt, cashless, turning, food, vou...",0,1
5145,…and modi ji’s Finance Minister Jaitley Lies t...,and modi finance minister jaitley lies the...,and modi finance minister jaitley lies the cor...,0,"[and, modi, finance, minister, jaitley, lies, ...",0,1
2574,India's exports grew fastest in 2016-17 in las...,indias exports grew fastest 201617 last 5 ye...,indias exports grew fastest 201617 last 5 year...,0,"[indias, exports, grew, fastest, last, years, ...",0,1
2575,India's exports grow fastest in last 5 years a...,indias exports grow fastest last 5 years and ...,indias exports grow fastest last 5 years and y...,0,"[indias, exports, grow, fastest, last, years, ...",0,1
...,...,...,...,...,...,...,...
3909,RT @indu_jha: Where are those dogs who started...,indu jha where are those dogs who started bar...,indu jha where are those dogs who started bark...,0,"[indu, jha, where, are, those, dogs, who, star...",10,1
1828,Bhaichung Bhutia protest against #Demonetizati...,bhaichung bhutia protest against demonetizati...,bhaichung bhutia protest against demonetizatio...,0,"[bhaichung, bhutia, protest, against, demoneti...",10,1
506,(Table shows)Demonetization had least effect (...,table showsdemonetization had least effect re...,table showsdemonetization had least effect red...,0,"[table, showsdemonetization, had, least, effec...",10,1
3483,RT @ShirishKunder: 500 Crore wedding of Janard...,shirishkunder 500 crore wedding janardhan re...,shirishkunder 500 crore wedding janardhan redd...,0,"[shirishkunder, crore, wedding, janardhan, red...",10,9


### Discard the clusters with poor Silhouette score

In [30]:
# Compute the silhouette scores for each sample
sample_silhouette_values = silhouette_samples(X, cluster_labels)

poor_cluster_indices = []
avg_cluster_sil_score = []

for i in range(n_best_clusters):
# Aggregate the silhouette scores for samples belonging to
# cluster i, and sort them
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
        avgscore = (np.mean(ith_cluster_silhouette_values))   #average silhouette score for each cluster
        avg_cluster_sil_score = np.append(avg_cluster_sil_score, avgscore)
        print('Cluster',i, ':', avgscore)
        if avgscore < 0.2:
            poor_cluster_indices = np.append(poor_cluster_indices, i)
            
        ith_cluster_silhouette_values.sort()
        size_cluster_i = ith_cluster_silhouette_values.shape[0]

Cluster 0 : 0.3286860500156128
Cluster 1 : 0.6915130692335203
Cluster 2 : 0.20232296695245144
Cluster 3 : 0.5270271683340432
Cluster 4 : 0.6838468218033299
Cluster 5 : 0.48881357777120044
Cluster 6 : 0.5420609738388532
Cluster 7 : 0.8716226396845148
Cluster 8 : 0.5101253470970537
Cluster 9 : 0.4920340611886833
Cluster 10 : 0.6486355697282266


In [31]:
poor_cluster_indices

[]

In [32]:
#remove those rows where cluster value match poor_cluster_indices 
avg_cluster_sil_score_final = []
cluster_name = np.unique(dfOrdered['cl_num'])

if (len(poor_cluster_indices)!=0):
    n_final_clusters = n_best_clusters - len(poor_cluster_indices)
    for i in poor_cluster_indices:
        dfUnique = dfUnique[dfUnique['cl_num'] != i]
    for j in cluster_name:
        if j not in poor_cluster_indices:    
            avg_cluster_sil_score_final = np.append(avg_cluster_sil_score_final, avg_cluster_sil_score[j])
            
    cluster_name = np.unique(dfUnique['cl_num'])

In [33]:
dfUnique['cl_num'] = abs(dfUnique['cl_num'])
dfUnique = dfUnique.sort_values(by=['cl_num'])

In [34]:
dfUnique

Unnamed: 0,tweet,cleaned_tweet,fully_cleaned_tweet,sentiment,tokenized_tweet,cl_num,freq
2573,India's experiencing a mobile POS boom followi...,indias experiencing mobile pos boom following...,indias experiencing mobile pos boom following ...,0,"[indias, experiencing, mobile, pos, boom, foll...",0,1
4109,RT @ram2sun: Don't give ideas to India's wanna...,ram2sun dont give ideas indias wannabe mugab...,ram2sun dont give ideas indias wannabe mugabe ...,0,"[dont, give, ideas, indias, wannabe, mugabe, d...",0,5
4092,RT @priyankac19: Shri Modiji if you have jokes...,priyankac19 shri modiji you have jokes crac...,priyankac19 shri modiji you have jokes crack d...,0,"[shri, modiji, you, have, jokes, crack, demone...",0,1
4108,RT @rajeev_mp: Today again #demonetization deb...,rajeev today again demonetization debate p...,rajeev today again demonetization debate parlm...,0,"[rajeev, today, again, demonetization, debate,...",0,20
4107,RT @rajeev_mp: Today a washout of #parliament....,rajeev today washout parliament cudnt sp...,rajeev today washout parliament cudnt spk bcoz...,0,"[rajeev, today, washout, parliament, cudnt, sp...",0,1
...,...,...,...,...,...,...,...
1646,A maligned perspective by @mihirssharma must b...,maligned perspective mihirssharma must stu...,maligned perspective mihirssharma must stung d...,0,"[maligned, perspective, mihirssharma, must, st...",10,1
142,#Demonetization Cash Crunch in #Andaman Banks ...,demonetization cash crunch andaman banks ca...,demonetization cash crunch andaman banks causi...,0,"[demonetization, cash, crunch, andaman, banks,...",10,1
3286,RT @LivingOnChi: 1/21/17 Demonetization: The S...,livingonchi 12117 demonetization the siniste...,livingonchi 12117 demonetization the sinister ...,0,"[livingonchi, demonetization, the, sinister, a...",10,1
3284,RT @LivingOnChi: .@zerohedge 1/21/17 Demonetiz...,livingonchi zerohedge 12117 demonetization ...,livingonchi zerohedge 12117 demonetization the...,0,"[livingonchi, zerohedge, demonetization, the, ...",10,3


# Step 6: Calculate abstraction and expression for each narrative 
Note that each cluster represents a narrative

In [35]:
tweets_to_consider = 'fully_cleaned_tweet'

In [36]:
final_clusters = np.unique(dfUnique['cl_num'])
print(final_clusters)

[ 0  1  2  3  4  5  6  7  8  9 10]


In [37]:
#Store all tweets corresponding to each cluster in a file
for i in final_clusters:
    with open('./tweets_Cluster_'+str(i)+'.txt','w') as out:
        y = ''
        for x in dfUnique[tweets_to_consider][dfUnique.cl_num == i]:    
            y = y + x + '. '
        out.write(y)
        out.close()

In [38]:
#A combination of (Noun, adjective, cardinal number, foreign word and Verb) are being extracted now
#Extract chunks matching pattern. Patterns are:
#1) Noun phrase (2 or more nouns occurring together. Ex United states of America, Abdul Kalam etc)
#2) Number followed by Noun (Ex: 28 Terrorists, 45th President)
#3) Adjective followed by Noun (Ex: Economic impact, beautiful inauguration)
#4) Foreign word (Ex: Jallikattu, Narendra modi, Pappu)
#5) Noun followed by Verb (Ex: Terrorists arrested)
#And a combination of all 5
        
import re
import nltk

phrases = pd.DataFrame({'extracted_phrases': [], 'cluster_num': []})


A = '(CD|JJ)/\w+\s'  #cd or jj
B = '(NN|NNS|NNP|NNPS)/\w+\s'  #nouns
C = '(VB|VBD|VBG|VBN|VBP|VBZ)/\w+\s' #verbs
D = 'FW/\w+\s'  #foreign word
patterns = ['('+A+B+')+', '('+D+B+')+','('+D+')+', '('+B+')+', '('+D+A+B+')+', 
           '('+B+C+')+', '('+D+B+C+')+', '('+B+A+B+')+', '('+B+B+C+')+'] 


def extract_phrases(tag1, tag2, sentences):
    extract_phrase = []
    for sentence in sentences:
        phrase = []
        next_word = 0
        for word, pos in nltk.pos_tag(nltk.word_tokenize(sentence)):
            if next_word == 1:
                next_word = 0
                if pos == tag2:
                    extract_phrase = np.append(extract_phrase,phrase + ' ' + word) 
            
            if pos == tag1:
                next_word = 1
                phrase = word
    return extract_phrase

for i in cluster_name:
    File = open('./tweets_Cluster_'+str(i)+'.txt', 'r') #open file
    lines = File.read() #read all lines
    sentences = nltk.sent_tokenize(lines) #tokenize sentences

    for sentence in sentences: 
        f = nltk.pos_tag(nltk.word_tokenize(sentence))
        tag_seq = []
        for word, pos in f:
            tag_seq.append(pos+'/'+ word)
        X = " ".join(tag_seq)

        phrase = []
        for j in range(len(patterns)):
            if re.search(patterns[j], X):
                phrase.append(' '.join([word.split('/')[1] for word in re.search(patterns[j], X).group(0).split()]))
    
        k = pd.DataFrame({'extracted_phrases': np.unique(phrase), 'cluster_num': int(i)})
    
        phrases = pd.concat([phrases,k], ignore_index = True)

print(phrases)

            extracted_phrases  cluster_num
0              boom following          0.0
1                  mobile pos          0.0
2                    pos boom          0.0
3          pos boom following          0.0
4      dont give ideas indias          0.0
...                       ...          ...
15452                 2 weeks         10.0
15453          demonetization         10.0
15454           lines reduced         10.0
15455    speech lines reduced         10.0
15456        thoughts 2 weeks         10.0

[15457 rows x 2 columns]


### Keeping the largest phrase

In [39]:
#For each phrase identified replace all the substrings by the largest phrase 
#Ex: lakh looted,40 lakh looted and Rs 40 lakh looted, replace all by single largest phrase - Rs 40 lakh looted 
#i.e. instead of 3 different phrases, there will be only one large phrase

phrases_final = pd.DataFrame({'extracted_phrases': [], 'cluster_num': []})
for i in cluster_name:
    phrases_for_each_cluster = []
    cluster_phrases = phrases['extracted_phrases'][phrases.cluster_num == i]
    cluster_phrases = np.unique(np.array(cluster_phrases))
    for j in range(len(cluster_phrases)):
        
        phrase = cluster_phrases[j]
        updated_cluster_phrases = np.delete((cluster_phrases), j)
        if any(phrase in phr for phr in updated_cluster_phrases): 
            'y'
        else: 
            #considering phrases of length greater than 1 only
            if (len(phrase.split(' '))) > 1:
                phrases_for_each_cluster.append(phrase)
    k = pd.DataFrame({'extracted_phrases': phrases_for_each_cluster, 'cluster_num': int(i) })
    
    phrases_final = pd.concat([phrases_final,k], ignore_index = True)

In [40]:
phrases_final

Unnamed: 0,extracted_phrases,cluster_num
0,007 economictimes mamataofficial mom,0.0
1,01 deposits indias banks,0.0
2,05012017but till,0.0
3,06 bharat,0.0
4,1 end black money 2 end,0.0
...,...,...
7322,withdraw cash,10.0
7323,withdraw money,10.0
7324,withdrawal restrictions were,10.0
7325,withdrawals allowed,10.0


In [41]:
phrases_final.columns

Index(['extracted_phrases', 'cluster_num'], dtype='object')

### For each phrase in each cluster, calculate term frequency 

In [42]:
#Term-frequency : For each cluster, calculate the number of times a given phrase occur in the tweets of that cluster

phrases_final['term_freq'] = len(phrases_final)*[0]

for i in cluster_name:
    for phrase in phrases_final['extracted_phrases'][phrases_final.cluster_num == i]:
        tweets = dfUnique[tweets_to_consider][dfUnique.cl_num == i]
        for tweet in tweets:
            if phrase in tweet:
                phrases_final['term_freq'][(phrases_final.extracted_phrases == phrase) & (phrases_final.cluster_num == i)] = phrases_final['term_freq'][(phrases_final.extracted_phrases == phrase) & (phrases_final.cluster_num == i)] + 1

### For each phrase in each cluster, calculate document frequency

In [43]:
#Document-frequency
phrases_final['doc_freq'] = len(phrases_final)*[0]


# for each phrase, compute the number of clusters that Sphrase occurs in
for phrase in phrases_final['extracted_phrases']:
    for i in cluster_name:
        all_tweets = ''
        for tweet in dfUnique[tweets_to_consider][dfUnique.cl_num == i]:
            all_tweets = all_tweets + tweet + '. ' 
        if phrase in all_tweets:
            phrases_final['doc_freq'][(phrases_final.extracted_phrases == phrase) & (phrases_final.cluster_num == i)] = phrases_final['doc_freq'][(phrases_final.extracted_phrases == phrase) & (phrases_final.cluster_num == i)] + 1
        

In [44]:
import math
phrases_final['doc_freq'] = phrases_final['doc_freq'].apply(lambda x: math.log10(n_best_clusters/(x)) )

### For each phrase in each cluster, calculate tf-idf

In [45]:
phrases_final['tf-idf'] = phrases_final['term_freq']*phrases_final['doc_freq']

In [46]:
phrases_final

Unnamed: 0,extracted_phrases,cluster_num,term_freq,doc_freq,tf-idf
0,007 economictimes mamataofficial mom,0.0,1,1.041393,1.041393
1,01 deposits indias banks,0.0,1,1.041393,1.041393
2,05012017but till,0.0,1,1.041393,1.041393
3,06 bharat,0.0,1,1.041393,1.041393
4,1 end black money 2 end,0.0,2,1.041393,2.082785
...,...,...,...,...,...
7322,withdraw cash,10.0,2,0.564271,1.128543
7323,withdraw money,10.0,1,0.740363,0.740363
7324,withdrawal restrictions were,10.0,1,0.740363,0.740363
7325,withdrawals allowed,10.0,1,1.041393,1.041393


### For each cluster find top few phrases and respective sentiment
 

In [47]:
phrases_final['diff_tf-idf'] = len(phrases_final)*[0]

narrative = pd.DataFrame({'cl_num': [], 'abstraction': []})
for i in cluster_name: 
    # arrange in descending order of tf-idf score
    phrases_final = phrases_final.sort_values(['cluster_num','tf-idf'], ascending=[1,0])
    
    #Break this distribution at a point where the difference between any consecutive phrases is maximum
    #difference between consecutive values of tf-idf 
    phrases_final['diff_tf-idf'][phrases_final.cluster_num == i] = abs(phrases_final['tf-idf'][phrases_final.cluster_num == i] - phrases_final['tf-idf'][phrases_final.cluster_num == i].shift(1))

    #The last value for each cluster will be 'NaN'. Replacing it with '0'. 
    phrases_final = phrases_final.fillna(0)
    
    phrases_final = phrases_final.reset_index(drop = True) #to avoid old index being added as a new column
    if len(phrases_final[phrases_final.cluster_num == i]) != 0:
        
        #index corresponding to the highest difference
 
        ind = (phrases_final['diff_tf-idf'][phrases_final.cluster_num == i]).idxmax()
        
        abstract = phrases_final['extracted_phrases'][:ind+1][phrases_final.cluster_num == i]
    
    
        #store the abstraction corresponding to each cluster
        k = pd.DataFrame({'cl_num': int(i), 'abstraction': abstract})
        narrative = pd.concat([narrative,k], ignore_index = True)

In [48]:
dfUnique

Unnamed: 0,tweet,cleaned_tweet,fully_cleaned_tweet,sentiment,tokenized_tweet,cl_num,freq
2573,India's experiencing a mobile POS boom followi...,indias experiencing mobile pos boom following...,indias experiencing mobile pos boom following ...,0,"[indias, experiencing, mobile, pos, boom, foll...",0,1
4109,RT @ram2sun: Don't give ideas to India's wanna...,ram2sun dont give ideas indias wannabe mugab...,ram2sun dont give ideas indias wannabe mugabe ...,0,"[dont, give, ideas, indias, wannabe, mugabe, d...",0,5
4092,RT @priyankac19: Shri Modiji if you have jokes...,priyankac19 shri modiji you have jokes crac...,priyankac19 shri modiji you have jokes crack d...,0,"[shri, modiji, you, have, jokes, crack, demone...",0,1
4108,RT @rajeev_mp: Today again #demonetization deb...,rajeev today again demonetization debate p...,rajeev today again demonetization debate parlm...,0,"[rajeev, today, again, demonetization, debate,...",0,20
4107,RT @rajeev_mp: Today a washout of #parliament....,rajeev today washout parliament cudnt sp...,rajeev today washout parliament cudnt spk bcoz...,0,"[rajeev, today, washout, parliament, cudnt, sp...",0,1
...,...,...,...,...,...,...,...
1646,A maligned perspective by @mihirssharma must b...,maligned perspective mihirssharma must stu...,maligned perspective mihirssharma must stung d...,0,"[maligned, perspective, mihirssharma, must, st...",10,1
142,#Demonetization Cash Crunch in #Andaman Banks ...,demonetization cash crunch andaman banks ca...,demonetization cash crunch andaman banks causi...,0,"[demonetization, cash, crunch, andaman, banks,...",10,1
3286,RT @LivingOnChi: 1/21/17 Demonetization: The S...,livingonchi 12117 demonetization the siniste...,livingonchi 12117 demonetization the sinister ...,0,"[livingonchi, demonetization, the, sinister, a...",10,1
3284,RT @LivingOnChi: .@zerohedge 1/21/17 Demonetiz...,livingonchi zerohedge 12117 demonetization ...,livingonchi zerohedge 12117 demonetization the...,0,"[livingonchi, zerohedge, demonetization, the, ...",10,3


In [49]:
#Assigning polarity based on the sentiment for each tweet 2=negative, 1=positive, 3=neutral
dfUnique['polarity'] = np.NaN
dfUnique['polarity'][dfUnique.sentiment == 0.5] = "3"
dfUnique['polarity'][dfUnique.sentiment == 1] = "1"
dfUnique['polarity'][dfUnique.sentiment == 0] = "2"

### Assign the sentiment to each extracted phrases
count the number of tweets, a phrase has occurred in positive, negative and neutral context. Assign the most occurred sentiment to the phrase

In [50]:
from collections import Counter

#find the highest occurring sentiment corresponding to each tweet
def find_mode(a):
    b = Counter(a).most_common(3)
    mode = []; c_max = 0
    for a,c in b:
        if c>c_max:
            c_max = c
        if c_max == c:
            mode.append(a)  
    print(mode)
    mode.sort()
    print(mode)
    
    ## if mode is 3&2 i.e. neutral and negative, assign the overall sentiment for that phrase as negative, 
    ## if mode is 3&1 i.e. neutral and positive, assign the overall sentiment for that phrase as positive,
    ## if mode is 2&1 i.e. negative and positive, assign the overall sentiment for that phrase as neutal, 
    ## if mode is 3&2&1 i.e. negative, positive and neutral, assign the overall sentiment for that phrase as neutral
    
    if len(mode) == 1:
        return mode[0]
    
    elif (len(mode) == 2) & (mode[1]=='3'):
        return mode[0]
    else:
        return 3
    
#1=>+ve 2=>-ve 3=>Neutral
narrative['expression'] = -1
dfUnique = dfUnique.reset_index(drop = True)
for i in cluster_name:
    tweets = dfUnique[tweets_to_consider][dfUnique.cl_num == i]
    abstracts = narrative['abstraction'][narrative.cl_num == i] 
    for abst in abstracts:
        sent = []
        for tweet, polarity in zip(dfUnique[tweets_to_consider][dfUnique.cl_num == i], dfUnique['polarity'][dfUnique.cl_num == i]):
            if abst in tweet:
                sent = np.append(sent, polarity)
        
        
        if len(sent)!=0:
            ## if mode is 3&2-2, 3&1-1, 2&1-3, 3&2&1 - 3
            senti = find_mode(sent)
            if senti == '2':
                sent_value = "Negative"
            elif senti == '1':
                sent_value = "Positive"
            else:
                sent_value = "Neutral"
            narrative['expression'][(narrative.abstraction == abst) & (narrative.cl_num == i)] = sent_value
        

['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['1']
['1']
['1']
['1']
['1']
['1']
['1']
['1']
['1']
['1']
['1']
['1']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['1']
['1']
['1']
['1']
['1']
['1']
['1']
['1']
['1']
['1']
['1']
['1']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']




# Save the narratives in excel file
 With each sheet in the file representing 1 narrative ( == 1 cluster)

In [51]:
cluster_name

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10], dtype=int32)

In [52]:
#sudo pip install xlwt
#sudo pip3 install openpyxl
from pandas import ExcelWriter

#Save the narratives in an excel file 

writer = pd.ExcelWriter('narrative.xlsx')
for i in cluster_name:
    df1 = pd.DataFrame(dfUnique[['tweet','freq']][dfUnique.cl_num == i]).sort_values(['freq'], ascending = [0])
    df1 = pd.DataFrame({'tweet': dfUnique['tweet'][dfUnique.cl_num == i], 'freq': dfUnique['freq'][dfUnique.cl_num == i]}) 
    df1 = df1.sort_values(['freq'], ascending = [0]) 
    #print(df1)
    df2 = pd.DataFrame({ 'abstraction': narrative['abstraction'][narrative.cl_num == i], 'expression': narrative['expression'][narrative.cl_num == i]})
    df3 = pd.DataFrame({'abstraction': (len(df1)-len(df2))*['-'], 'expression': (len(df1)-len(df2))*['-']})

    #print(df3)
    
    #df2 = df2.append(df3)
    df2 = pd.concat([df2,df3])

    df1 = df1.reset_index(drop=True)
    df2 = df2.reset_index(drop=True)
    df1['abstraction'] = df2['abstraction']
    df1['expression'] = df2['expression']

    df1.to_excel(writer,'narrative_cluster'+str(i))

writer.close()
    

In [53]:
narrative

Unnamed: 0,cl_num,abstraction,expression
0,0.0,attempt cashless turning food vouchers digital,Negative
1,0.0,demonetization dea secy dasshaktikanta,Negative
2,0.0,digital payments,Negative
3,0.0,indias attempt,Negative
4,0.0,payments growth india,Negative
5,0.0,demonetization time,Negative
6,1.0,youtube video,Positive
7,1.0,people are,Positive
8,2.0,baseless young turks,Positive
9,2.0,turks get,Positive
