In [1]:
import pandas as pd 
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

from textblob import TextBlob

## Reading and understanding the data

In [2]:
df = pd.read_csv('demonetization-tweets.csv',encoding='ISO-8859-1')
df.head()

Unnamed: 0.1,Unnamed: 0,X,text,favorited,favoriteCount,replyToSN,created,truncated,replyToSID,id,replyToUID,statusSource,screenName,retweetCount,isRetweet,retweeted
0,1,1,RT @rssurjewala: Critical question: Was PayTM ...,False,0,,2016-11-23 18:40:30,False,,8.014957e+17,,"<a href=""http://twitter.com/download/android"" ...",HASHTAGFARZIWAL,331,True,False
1,2,2,RT @Hemant_80: Did you vote on #Demonetization...,False,0,,2016-11-23 18:40:29,False,,8.014957e+17,,"<a href=""http://twitter.com/download/android"" ...",PRAMODKAUSHIK9,66,True,False
2,3,3,"RT @roshankar: Former FinSec, RBI Dy Governor,...",False,0,,2016-11-23 18:40:03,False,,8.014955e+17,,"<a href=""http://twitter.com/download/android"" ...",rahulja13034944,12,True,False
3,4,4,RT @ANI_news: Gurugram (Haryana): Post office ...,False,0,,2016-11-23 18:39:59,False,,8.014955e+17,,"<a href=""http://twitter.com/download/android"" ...",deeptiyvd,338,True,False
4,5,5,RT @satishacharya: Reddy Wedding! @mail_today ...,False,0,,2016-11-23 18:39:39,False,,8.014954e+17,,"<a href=""http://cpimharyana.com"" rel=""nofollow...",CPIMBadli,120,True,False


In [3]:
df.shape

(14940, 16)

In [4]:
# Checking is there is any missing valaues in tweet
df['text'].isnull().sum()

0

## Cleaning the tweets

In [5]:
df = df['text']
df = pd.DataFrame({'tweet':df})
df.head()

Unnamed: 0,tweet
0,RT @rssurjewala: Critical question: Was PayTM ...
1,RT @Hemant_80: Did you vote on #Demonetization...
2,"RT @roshankar: Former FinSec, RBI Dy Governor,..."
3,RT @ANI_news: Gurugram (Haryana): Post office ...
4,RT @satishacharya: Reddy Wedding! @mail_today ...


In [6]:
# Removing few characters 
df['cleaned_tweet'] = df['tweet'].replace(r'\'|\"|\,|\.|\?|\+|\-|\/|\=|\(|\)|\n|"', '', regex=True)
# Replacing few double spaces with single space
df['cleaned_tweet'] = df['cleaned_tweet'].replace("  ", " ")

# remove emoticons form the tweets
df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'<ed>','', regex = True)
df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'\B<U+.*>|<U+.*>\B|<U+.*>','', regex = True)

# convert tweets to lowercase
df['cleaned_tweet'] = df['cleaned_tweet'].str.lower()
    
#remove user mentions
df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'^(@\w+)',"", regex=True)
    
#remove 'rt' in the beginning
df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'^(rt @)',"", regex=True)
    
#remove_symbols
df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'[^a-zA-Z0-9]', " ", regex=True)

#remove punctuations 
df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'[[]!"#$%\'()\*+,-./:;<=>?^_`{|}]+',"", regex = True)

#remove_URL(x):
df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'https.*$', "", regex = True)

#remove 'amp' in the text
df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'amp',"", regex = True)

#remove words of length 1 or 2 
df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'\b[a-zA-Z]{1,2}\b','', regex=True)

#remove extra spaces in the tweet
df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'^\s+|\s+$'," ", regex=True)



  re.compile(obj)


Now we have the cleaned_tweet. But the stop words are still present. We can use this cleaned_tweet forcreating phrases and ranking the phrases from the tweets.

We will also remove the stop words and name the column as fully_cleaned_tweet. This will be used for clustering the sentiments.

In [7]:
# list of words to remove
words_to_remove = ["ax","i","you","edu","s","t","m","subject","can","lines","re","what", "there","all","we",
                "one","the","a","an","of","or","in","for","by","on","but","is","in","a","not","with","as",
                "was","if","they","are","this","and","it","have","has","from","at","my","be","by","not","that","to",
                "from","com","org","like","likes","so","said","from","what","told","over","more","other",
                "have","last","with","this","that","such","when","been","says","will","also","where","why",
                "would","today", "in", "on", "you", "r", "d", "u", "hw","wat", "oly", "s", "b", "ht", 
                "rt", "p","the","th", "n", "was", "via"]

#remove stopwords and words_to_remove
stop_words = set(stopwords.words('english'))
mystopwords = [stop_words, words_to_remove]

df['fully_cleaned_tweet'] = df['cleaned_tweet'].apply(lambda x: ' '.join([word for word in x.split() if word not in mystopwords]))

In [8]:
df.head()

Unnamed: 0,tweet,cleaned_tweet,fully_cleaned_tweet
0,RT @rssurjewala: Critical question: Was PayTM ...,rssurjewala critical question was paytm info...,rssurjewala critical question was paytm inform...
1,RT @Hemant_80: Did you vote on #Demonetization...,hemant 80 did you vote demonetization modi...,hemant 80 did you vote demonetization modi sur...
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",roshankar former finsec rbi governor cbdt ch...,roshankar former finsec rbi governor cbdt chai...
3,RT @ANI_news: Gurugram (Haryana): Post office ...,ani news gurugram haryana post office employ...,ani news gurugram haryana post office employee...
4,RT @satishacharya: Reddy Wedding! @mail_today ...,satishacharya reddy wedding mail today cart...,satishacharya reddy wedding mail today cartoon...


In [9]:
# Assigning sentiment for each tweet which ranges between -1 to +1
# -1.0 is a negative polarity 
# 1.0 is a positive
# 0 is a neutral polarity
df['sentiment'] = df['fully_cleaned_tweet'].apply(lambda x: TextBlob(x).sentiment.polarity)
df.head()

Unnamed: 0,tweet,cleaned_tweet,fully_cleaned_tweet,sentiment
0,RT @rssurjewala: Critical question: Was PayTM ...,rssurjewala critical question was paytm info...,rssurjewala critical question was paytm inform...,0.15
1,RT @Hemant_80: Did you vote on #Demonetization...,hemant 80 did you vote demonetization modi...,hemant 80 did you vote demonetization modi sur...,0.0
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",roshankar former finsec rbi governor cbdt ch...,roshankar former finsec rbi governor cbdt chai...,0.0
3,RT @ANI_news: Gurugram (Haryana): Post office ...,ani news gurugram haryana post office employ...,ani news gurugram haryana post office employee...,0.0
4,RT @satishacharya: Reddy Wedding! @mail_today ...,satishacharya reddy wedding mail today cart...,satishacharya reddy wedding mail today cartoon...,0.0


## Vectorize the tweets

In [10]:
# Creating tokens for each tweet
df['tokenized_tweet'] = df['fully_cleaned_tweet'].apply(word_tokenize)
df.head()

Unnamed: 0,tweet,cleaned_tweet,fully_cleaned_tweet,sentiment,tokenized_tweet
0,RT @rssurjewala: Critical question: Was PayTM ...,rssurjewala critical question was paytm info...,rssurjewala critical question was paytm inform...,0.15,"[rssurjewala, critical, question, was, paytm, ..."
1,RT @Hemant_80: Did you vote on #Demonetization...,hemant 80 did you vote demonetization modi...,hemant 80 did you vote demonetization modi sur...,0.0,"[hemant, 80, did, you, vote, demonetization, m..."
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",roshankar former finsec rbi governor cbdt ch...,roshankar former finsec rbi governor cbdt chai...,0.0,"[roshankar, former, finsec, rbi, governor, cbd..."
3,RT @ANI_news: Gurugram (Haryana): Post office ...,ani news gurugram haryana post office employ...,ani news gurugram haryana post office employee...,0.0,"[ani, news, gurugram, haryana, post, office, e..."
4,RT @satishacharya: Reddy Wedding! @mail_today ...,satishacharya reddy wedding mail today cart...,satishacharya reddy wedding mail today cartoon...,0.0,"[satishacharya, reddy, wedding, mail, today, c..."


In [11]:
#if a word has a digit, remove that word
df['tokenized_tweet'] = df['tokenized_tweet'].apply(lambda x: [y for y in x if not any(c.isdigit() for c in y)])

In [12]:
# Set values for various parameters
num_features = 100    # Word vector dimensionality                      
min_word_count = 1   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size   

In [13]:
# Initialize and train the model
from gensim.models import word2vec

model = word2vec.Word2Vec(df['tokenized_tweet'], 
                          workers=num_workers, 
                          size=num_features, 
                          min_count = min_word_count, 
                          window = context)

# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

### Find vector corresponding to each tweet
Take the average of all word vectors in a tweet. This way we will find the vector for each tweet.

In [14]:
import numpy as np

vocab = list(model.wv.vocab)

def sentence_vector(sentence, model):
    nwords = 0
    featureV = np.zeros(100, dtype="float32")
    for word in sentence:
        featureV = np.add(featureV, model[word])
        nwords = nwords + 1
        
    featureV = np.divide(featureV, nwords)
    return featureV

tweet_vector = df['tokenized_tweet'].apply(lambda x: sentence_vector(x, model))  

tweet_vector = tweet_vector.apply(pd.Series)

  if __name__ == '__main__':


In [15]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
tweet_vector = scaler.fit_transform(tweet_vector)
tweet_vector = pd.DataFrame(tweet_vector)
tweet_vector

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.586995,0.137819,0.531457,0.312949,0.830650,0.901688,0.965286,0.735264,0.509050,0.262754,...,0.591593,0.910386,0.218638,0.542599,0.312191,0.232980,0.683697,0.421863,0.186054,0.684754
1,0.724390,0.325382,0.878921,0.276815,0.702868,0.638475,0.739306,0.389464,0.269896,0.631197,...,0.633481,0.545899,0.203313,0.453147,0.551553,0.162899,0.480723,0.704685,0.556957,0.625343
2,0.475013,0.375936,0.724312,0.084726,0.679788,0.535971,0.674786,0.520929,0.454589,0.673113,...,0.754519,0.638544,0.109514,0.383003,0.603779,0.100743,0.592014,0.505857,0.419922,0.873062
3,0.313917,0.411736,0.646651,0.095491,0.581554,0.333667,0.704571,0.506270,0.459693,0.738550,...,0.679851,0.512142,0.181879,0.368066,0.591715,0.121478,0.634640,0.424555,0.382954,0.894702
4,0.462676,0.075641,0.712272,0.269145,0.749692,0.585750,0.784891,0.517085,0.435273,0.784821,...,0.411311,0.462005,0.348756,0.357264,0.623920,0.140539,0.493400,0.285758,0.390584,0.584178
5,0.585956,0.525033,0.540028,0.379618,0.414513,0.501727,0.507622,0.326875,0.517132,0.754114,...,0.485582,0.382546,0.332205,0.490023,0.503637,0.221841,0.402129,0.646367,0.574488,0.597955
6,0.219672,0.369896,0.748176,0.315878,0.673257,0.158895,0.707999,0.251716,0.300993,0.661514,...,0.794606,0.647051,0.122794,0.341090,0.727953,0.267382,0.751271,0.628278,0.579503,0.795391
7,0.721734,0.584356,0.587944,0.038963,0.549958,0.573875,0.773273,0.679498,0.333567,0.409956,...,0.773789,0.817617,0.251626,0.591635,0.126516,0.234182,0.769981,0.545425,0.533951,0.946603
8,0.626295,0.688058,0.511210,0.199063,0.405476,0.446781,0.582765,0.592306,0.507608,0.554312,...,0.634367,0.538257,0.263114,0.597932,0.405452,0.266083,0.591268,0.547136,0.500551,0.726054
9,0.487272,0.481814,0.589012,0.280836,0.565977,0.437079,0.612912,0.360973,0.477594,0.550213,...,0.603004,0.605234,0.271190,0.432122,0.530197,0.301987,0.641741,0.566306,0.569648,0.671736


### Add sentiment to the tweet vector

In [16]:
#Scale the 'sentiment' vector, as the sentiment varies from -1 to +1
def sentiment(x):
    if x < 0.04:
        return 0
    elif x > 0.04:
        return 1
    else:
        return 0.5

# Adding sentiment to the 100th dimension
tweet_vector[100] = df['sentiment'].apply(lambda x: sentiment(x))

tweet_vector

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,0.586995,0.137819,0.531457,0.312949,0.830650,0.901688,0.965286,0.735264,0.509050,0.262754,...,0.910386,0.218638,0.542599,0.312191,0.232980,0.683697,0.421863,0.186054,0.684754,1
1,0.724390,0.325382,0.878921,0.276815,0.702868,0.638475,0.739306,0.389464,0.269896,0.631197,...,0.545899,0.203313,0.453147,0.551553,0.162899,0.480723,0.704685,0.556957,0.625343,0
2,0.475013,0.375936,0.724312,0.084726,0.679788,0.535971,0.674786,0.520929,0.454589,0.673113,...,0.638544,0.109514,0.383003,0.603779,0.100743,0.592014,0.505857,0.419922,0.873062,0
3,0.313917,0.411736,0.646651,0.095491,0.581554,0.333667,0.704571,0.506270,0.459693,0.738550,...,0.512142,0.181879,0.368066,0.591715,0.121478,0.634640,0.424555,0.382954,0.894702,0
4,0.462676,0.075641,0.712272,0.269145,0.749692,0.585750,0.784891,0.517085,0.435273,0.784821,...,0.462005,0.348756,0.357264,0.623920,0.140539,0.493400,0.285758,0.390584,0.584178,0
5,0.585956,0.525033,0.540028,0.379618,0.414513,0.501727,0.507622,0.326875,0.517132,0.754114,...,0.382546,0.332205,0.490023,0.503637,0.221841,0.402129,0.646367,0.574488,0.597955,0
6,0.219672,0.369896,0.748176,0.315878,0.673257,0.158895,0.707999,0.251716,0.300993,0.661514,...,0.647051,0.122794,0.341090,0.727953,0.267382,0.751271,0.628278,0.579503,0.795391,0
7,0.721734,0.584356,0.587944,0.038963,0.549958,0.573875,0.773273,0.679498,0.333567,0.409956,...,0.817617,0.251626,0.591635,0.126516,0.234182,0.769981,0.545425,0.533951,0.946603,0
8,0.626295,0.688058,0.511210,0.199063,0.405476,0.446781,0.582765,0.592306,0.507608,0.554312,...,0.538257,0.263114,0.597932,0.405452,0.266083,0.591268,0.547136,0.500551,0.726054,1
9,0.487272,0.481814,0.589012,0.280836,0.565977,0.437079,0.612912,0.360973,0.477594,0.550213,...,0.605234,0.271190,0.432122,0.530197,0.301987,0.641741,0.566306,0.569648,0.671736,0


In [17]:
#Updating the 'sentiment' column in df also
df['sentiment'] = tweet_vector[100]

## Cluster the narratives [= opinions + expressions]

In [18]:
# List of indices of missing value rows
missing_row_indices = tweet_vector[tweet_vector.isnull().any(axis=1)].index.to_list()
print(missing_row_indices)

[416, 1232, 2082, 4912, 4915, 6106, 7651, 7705, 7894, 7940, 8069, 8191, 8312, 8332, 8333, 8337, 8385, 8695, 8808, 9259, 9795, 9948, 9952, 10098, 10308, 10486, 12400, 12593, 13285, 14023]


In [19]:
# Dropping the rows with any missing values
tweet_vector = tweet_vector.dropna()

In [20]:
# Dropping the same rows dropped in tweet_vector
df = df.drop(missing_row_indices)

In [21]:
# K-Means clustering of the tweet vectors

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples

range_n_clusters = [3, 4, 5, 6, 7, 8, 9, 10]
X = tweet_vector
n_best_clusters = 0
silhouette_best = 0

for n_clusters in range_n_clusters:
    
    # Initialize the clusterer with n_clusters value and a random generator
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(X)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed clusters
    silhouette_avg = silhouette_score(X, cluster_labels)
                                      
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)
    
    if silhouette_avg > silhouette_best:
        silhouette_best = silhouette_avg
        n_best_clusters = n_clusters

For n_clusters = 3 The average silhouette_score is : 0.19690291280291197
For n_clusters = 4 The average silhouette_score is : 0.20478526694796778
For n_clusters = 5 The average silhouette_score is : 0.20737350393008844
For n_clusters = 6 The average silhouette_score is : 0.2327237148199673
For n_clusters = 7 The average silhouette_score is : 0.24265551774044233
For n_clusters = 8 The average silhouette_score is : 0.2510624670248163
For n_clusters = 9 The average silhouette_score is : 0.25128856397739285
For n_clusters = 10 The average silhouette_score is : 0.25655590454390176


In [22]:
# Best number of cluster
n_best_clusters

10

In [23]:
# Clustering with n_best_clusters
clusterer = KMeans(n_clusters= n_best_clusters , random_state=10)
cluster_labels = clusterer.fit_predict(X)

In [24]:
clusters = np.unique(cluster_labels)  
print(clusters)

[0 1 2 3 4 5 6 7 8 9]


In [25]:
#Array of tweets, the corresponding cluster number, sentiment
finaldf = pd.DataFrame({'cl_num': cluster_labels,'fully_cleaned_tweet': df['fully_cleaned_tweet'], 'cleaned_tweet': df['cleaned_tweet'], 'tweet': df['tweet'],'sentiment': df['sentiment']})
finaldf = finaldf.sort_values(by=['cl_num'])
finaldf.head()

Unnamed: 0,cl_num,fully_cleaned_tweet,cleaned_tweet,tweet,sentiment
3022,0,centerofright demonetization other than the us...,centerofright demonetization other than the...,RT @centerofright: #DeMonetization - other tha...,0
8987,0,lionelmedia google and youtube are censoring t...,lionelmedia google and youtube are censoring ...,RT @LionelMedia: Google and YouTube are censor...,0
5404,0,shirishkunder minor inconvenience for major co...,shirishkunder minor inconvenience for major c...,"RT @ShirishKunder: ""Minor inconvenience"" for ""...",0
7468,0,drkumarvishwas and the oscar goes demonetization,drkumarvishwas and the oscar goes demonetiz...,"RT @DrKumarVishwas: And the Oscar goes to ""Mr....",0
7472,0,drkumarvishwas and the oscar goes demonetization,drkumarvishwas and the oscar goes demonetiz...,"RT @DrKumarVishwas: And the Oscar goes to ""Mr....",0


In [26]:
# Adding cluster numbers to df as well
df['cl_num'] = cluster_labels
df.head()

Unnamed: 0,tweet,cleaned_tweet,fully_cleaned_tweet,sentiment,tokenized_tweet,cl_num
0,RT @rssurjewala: Critical question: Was PayTM ...,rssurjewala critical question was paytm info...,rssurjewala critical question was paytm inform...,1,"[rssurjewala, critical, question, was, paytm, ...",7
1,RT @Hemant_80: Did you vote on #Demonetization...,hemant 80 did you vote demonetization modi...,hemant 80 did you vote demonetization modi sur...,0,"[hemant, did, you, vote, demonetization, modi,...",3
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",roshankar former finsec rbi governor cbdt ch...,roshankar former finsec rbi governor cbdt chai...,0,"[roshankar, former, finsec, rbi, governor, cbd...",5
3,RT @ANI_news: Gurugram (Haryana): Post office ...,ani news gurugram haryana post office employ...,ani news gurugram haryana post office employee...,0,"[ani, news, gurugram, haryana, post, office, e...",5
4,RT @satishacharya: Reddy Wedding! @mail_today ...,satishacharya reddy wedding mail today cart...,satishacharya reddy wedding mail today cartoon...,0,"[satishacharya, reddy, wedding, mail, today, c...",6


In [27]:
dfOrdered = pd.DataFrame(df)

#Compute how many times a tweet has been 'retweeted' - that is, how many rows in dfOrdered are identical
dfOrdered['tokenized_tweet'] = dfOrdered['tokenized_tweet'].apply(tuple)
dfUnique = dfOrdered.groupby(['tweet', 'cleaned_tweet', 'fully_cleaned_tweet', 'sentiment','tokenized_tweet', 'cl_num']).size().reset_index(name="freq")
dfUnique = dfUnique.sort_values(by=['cl_num'])

In [28]:
dfUnique['tokenized_tweet'] = dfUnique['tokenized_tweet'].apply(list)
dfOrdered['tokenized_tweet'] = dfOrdered['tokenized_tweet'].apply(list)

In [29]:
# We can see that there are around 5000 unique tweets
dfUnique.shape

(5118, 7)

# Calculate abstraction and expression for each narrative


* Abstraction: What the opinion is about, for e.g. an opinion on demonetisation can be 'about a topic' such as 'Digital India', corruption, PM Modi, etc.
* Expression: The 'sentiment' of the opinion, i.e. positive, negative or neutral.

Each cluster represents a narrative. In other words, people start supporting other people having similar opinions, and as a result, opinions turn into narratives.

In [30]:
#Store all tweets corresponding to each cluster in a file
for i in clusters:
    with open('./tweets_Cluster_'+str(i)+'.txt','w') as out:
        y = ''
        for x in dfUnique['fully_cleaned_tweet'][dfUnique.cl_num == i]:    
            y = y + x + '. '
        out.write(y)
        out.close()

In [31]:
#A combination of (Noun, adjective, cardinal number, foreign word and Verb) are being extracted now
#Extract chunks matching pattern. Patterns are:
#1) Noun phrase (2 or more nouns occurring together. Ex United states of America, Abdul Kalam etc)
#2) Number followed by Noun (Ex: 28 Terrorists, 45th President)
#3) Adjective followed by Noun (Ex: Economic impact, beautiful inauguration)
#4) Foreign word (Ex: Jallikattu, Narendra modi)
#5) Noun followed by Verb (Ex: Terrorists arrested)
#And a combination of all 5
        
import re
import nltk

phrases = pd.DataFrame({'extracted_phrases': [], 'cluster_num': []})


A = '(CD|JJ)/\w+\s'  #cd or jj
B = '(NN|NNS|NNP|NNPS)/\w+\s'  #nouns
C = '(VB|VBD|VBG|VBN|VBP|VBZ)/\w+\s' #verbs
D = 'FW/\w+\s'  #foreign word
patterns = ['('+A+B+')+', '('+D+B+')+','('+D+')+', '('+B+')+', '('+D+A+B+')+', 
           '('+B+C+')+', '('+D+B+C+')+', '('+B+A+B+')+', '('+B+B+C+')+'] 


def extract_phrases(tag1, tag2, sentences):
    extract_phrase = []
    for sentence in sentences:
        phrase = []
        next_word = 0
        for word, pos in nltk.pos_tag(nltk.word_tokenize(sentence)):
            if next_word == 1:
                next_word = 0
                if pos == tag2:
                    extract_phrase = np.append(extract_phrase,phrase + ' ' + word) 
            
            if pos == tag1:
                next_word = 1
                phrase = word
    return extract_phrase

for i in clusters:
    File = open('./tweets_Cluster_'+str(i)+'.txt', 'r') #open file
    lines = File.read() #read all lines
    sentences = nltk.sent_tokenize(lines) #tokenize sentences

    for sentence in sentences: 
        f = nltk.pos_tag(nltk.word_tokenize(sentence))
        tag_seq = []
        for word, pos in f:
            tag_seq.append(pos+'/'+ word)
        X = " ".join(tag_seq)

        phrase = []
        for j in range(len(patterns)):
            if re.search(patterns[j], X):
                phrase.append(' '.join([word.split('/')[1] for word in re.search(patterns[j], X).group(0).split()]))
    
        k = pd.DataFrame({'extracted_phrases': np.unique(phrase), 'cluster_num': int(i)})
    
        phrases = pd.concat([phrases,k], ignore_index = True)

print(phrases)

                                       extracted_phrases  cluster_num
0                                               isn time          0.0
1                                                   time          0.0
2                              lukewearechange news team          0.0
3                                 youtube demonetization          0.0
4                                          ads appearing          0.0
5                                               markdice          0.0
6                                      offensive youtube          0.0
7                                 system 11000th percent          0.0
8                                                  oscar          0.0
9                                             oscar goes          0.0
10                                               app war          0.0
11                                     nosylviaplath aap          0.0
12                                        demonetization          0.0
13                  

### Keeping the largest phrase

In [32]:
#For each phrase identified replace all the substrings by the largest phrase 
#Ex: lakh looted,40 lakh looted and Rs 40 lakh looted, replace all by single largest phrase - Rs 40 lakh looted 
#i.e. instead of 3 different phrases, there will be only one large phrase

phrases_final = pd.DataFrame({'extracted_phrases': [], 'cluster_num': []})
for i in clusters:
    phrases_for_each_cluster = []
    cluster_phrases = phrases['extracted_phrases'][phrases.cluster_num == i]
    cluster_phrases = np.unique(np.array(cluster_phrases))
    for j in range(len(cluster_phrases)):
        
        phrase = cluster_phrases[j]
        updated_cluster_phrases = np.delete((cluster_phrases), j)
        if any(phrase in phr for phr in updated_cluster_phrases): 
            'y'
        else: 
            #considering phrases of length greater than 1 only
            if (len(phrase.split(' '))) > 1:
                phrases_for_each_cluster.append(phrase)
    k = pd.DataFrame({'extracted_phrases': phrases_for_each_cluster, 'cluster_num': int(i) })
    
    phrases_final = pd.concat([phrases_final,k], ignore_index = True)

In [33]:
phrases_final

Unnamed: 0,extracted_phrases,cluster_num
0,10 pesos,0.0
1,2 weeks,0.0
2,823 people,0.0
3,accion indias demonetization,0.0
4,action taken,0.0
5,ads appearing,0.0
6,advertisers are,0.0
7,america sleeps,0.0
8,anandm25 shiva sena,0.0
9,app war,0.0


# Calculate TF-IDF score 

## For each phrase in each cluster, calculate term frequency

In [34]:
dfUnique.head()

Unnamed: 0,tweet,cleaned_tweet,fully_cleaned_tweet,sentiment,tokenized_tweet,cl_num,freq
0,""" Isnt it time for the demonetization of spir...",isn time for the demonetization spirituali...,isn time for the demonetization spirituality a...,1,"[isn, time, for, the, demonetization, spiritua...",0,1
3286,RT @Lukewearechange: my news team and I lost t...,lukewearechange news team and lost the majo...,lukewearechange news team and lost the majorit...,0,"[lukewearechange, news, team, and, lost, the, ...",0,19
3302,RT @MarkDice: The ads appearing on offensive Y...,markdice the ads appearing offensive youtube...,markdice the ads appearing offensive youtube v...,0,"[markdice, the, ads, appearing, offensive, you...",0,29
1751,"And the Oscar goes to ""Mr.<U+092D><U+093E><U+0...",and the oscar goes demonetization,and the oscar goes demonetization,0,"[and, the, oscar, goes, demonetization]",0,1
3353,RT @NoSylviaPlath: AAP CM and App PM war #demo...,nosylviaplath aap and app war demonetization,nosylviaplath aap and app war demonetization,0,"[nosylviaplath, aap, and, app, war, demonetiza...",0,1


In [35]:
#Term-frequency : For each cluster, calculate the number of times a given phrase occur in the tweets of that cluster

phrases_final['term_freq'] = len(phrases_final)*[0]

for i in clusters:
    for phrase in phrases_final['extracted_phrases'][phrases_final.cluster_num == i]:
        tweets = dfUnique['fully_cleaned_tweet'][dfUnique.cl_num == i]
        for tweet in tweets:
            if phrase in tweet:
                phrases_final['term_freq'][(phrases_final.extracted_phrases == phrase) & (phrases_final.cluster_num == i)] = phrases_final['term_freq'][(phrases_final.extracted_phrases == phrase) & (phrases_final.cluster_num == i)] + 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


### For each phrase in each cluster, calculate document frequency

In [36]:
#Document-frequency
phrases_final['doc_freq'] = len(phrases_final)*[0]


# for each phrase, compute the number of clusters that Sphrase occurs in
for phrase in phrases_final['extracted_phrases']:
    for i in clusters:
        all_tweets = ''
        for tweet in dfUnique['fully_cleaned_tweet'][dfUnique.cl_num == i]:
            all_tweets = all_tweets + tweet + '. ' 
        if phrase in all_tweets:
            phrases_final['doc_freq'][(phrases_final.extracted_phrases == phrase) & (phrases_final.cluster_num == i)] = phrases_final['doc_freq'][(phrases_final.extracted_phrases == phrase) & (phrases_final.cluster_num == i)] + 1
        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


In [37]:
# Calculate IDF
import math
phrases_final['doc_freq'] = phrases_final['doc_freq'].apply(lambda x: math.log10(n_best_clusters/(x)) )

### For each phrase in each cluster, calculate tf-idf

In [38]:
# Calculate TF-IDF
# TF X IDF
phrases_final['tf-idf'] = phrases_final['term_freq']*phrases_final['doc_freq']

In [39]:
phrases_final

Unnamed: 0,extracted_phrases,cluster_num,term_freq,doc_freq,tf-idf
0,10 pesos,0.0,1,1.000000,1.000000
1,2 weeks,0.0,2,0.522879,1.045757
2,823 people,0.0,1,1.000000,1.000000
3,accion indias demonetization,0.0,1,1.000000,1.000000
4,action taken,0.0,1,1.000000,1.000000
5,ads appearing,0.0,2,1.000000,2.000000
6,advertisers are,0.0,1,1.000000,1.000000
7,america sleeps,0.0,2,1.000000,2.000000
8,anandm25 shiva sena,0.0,1,1.000000,1.000000
9,app war,0.0,2,1.000000,2.000000


## For each cluster find top few phrases and respective sentiment

In [40]:
phrases_final['diff_tf-idf'] = len(phrases_final)*[0]

narrative = pd.DataFrame({'cl_num': [], 'abstraction': []})
for i in clusters: 
    # arrange in descending order of tf-idf score
    phrases_final = phrases_final.sort_values(['cluster_num','tf-idf'], ascending=[1,0])
    
    #Break this distribution at a point where the difference between any consecutive phrases is maximum
    #difference between consecutive values of tf-idf 
    phrases_final['diff_tf-idf'][phrases_final.cluster_num == i] = abs(phrases_final['tf-idf'][phrases_final.cluster_num == i] - phrases_final['tf-idf'][phrases_final.cluster_num == i].shift(1))

    #The last value for each cluster will be 'NaN'. Replacing it with '0'. 
    phrases_final = phrases_final.fillna(0)
    
    phrases_final = phrases_final.reset_index(drop = True) #to avoid old index being added as a new column
    if len(phrases_final[phrases_final.cluster_num == i]) != 0:
        
        #index corresponding to the highest difference
 
        ind = (phrases_final['diff_tf-idf'][phrases_final.cluster_num == i]).idxmax()
        
        abstract = phrases_final['extracted_phrases'][:ind+1][phrases_final.cluster_num == i]
    
    
        #store the abstraction corresponding to each cluster
        k = pd.DataFrame({'cl_num': int(i), 'abstraction': abstract})
        narrative = pd.concat([narrative,k], ignore_index = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [41]:
narrative

Unnamed: 0,cl_num,abstraction
0,0.0,demonetization searches
1,0.0,narendramodis app
2,0.0,question parliament
3,0.0,effects demonetization
4,0.0,demonetization fishermen kerala
5,0.0,indias coast
6,0.0,oscar goes
7,0.0,government demonetization say
8,1.0,implement demonetization
9,1.0,narendra modi


In [42]:
#Assigning polarity based on the sentiment for each tweet 2=negative, 1=positive, 3=neutral
dfUnique['polarity'] = np.NaN
dfUnique['polarity'][dfUnique.sentiment == 0.5] = "3"
dfUnique['polarity'][dfUnique.sentiment == 1] = "1"
dfUnique['polarity'][dfUnique.sentiment == 0] = "2"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


## Assign the sentiment to each extracted phrases
count the number of tweets, a phrase has occurred in positive, negative and neutral context. Assign the most occurred sentiment to the phrase

In [43]:
from collections import Counter

#find the highest occurring sentiment corresponding to each tweet
def find_mode(a):
    b = Counter(a).most_common(3)
    mode = []; c_max = 0
    for a,c in b:
        if c>c_max:
            c_max = c
        if c_max == c:
            mode.append(a)  
    print(mode)
    mode.sort()
    print(mode)
    
    ## if mode is 3&2 i.e. neutral and negative, assign the overall sentiment for that phrase as negative, 
    ## if mode is 3&1 i.e. neutral and positive, assign the overall sentiment for that phrase as positive,
    ## if mode is 2&1 i.e. negative and positive, assign the overall sentiment for that phrase as neutal, 
    ## if mode is 3&2&1 i.e. negative, positive and neutral, assign the overall sentiment for that phrase as neutral
    
    if len(mode) == 1:
        return mode[0]
    
    elif (len(mode) == 2) & (mode[1]=='3'):
        return mode[0]
    else:
        return 3
    
#1=>+ve 2=>-ve 3=>Neutral
narrative['expression'] = -1
dfUnique = dfUnique.reset_index(drop = True)
for i in clusters:
    tweets = dfUnique['fully_cleaned_tweet'][dfUnique.cl_num == i]
    abstracts = narrative['abstraction'][narrative.cl_num == i] 
    for abst in abstracts:
        sent = []
        for tweet, polarity in zip(dfUnique['fully_cleaned_tweet'][dfUnique.cl_num == i], dfUnique['polarity'][dfUnique.cl_num == i]):
            if abst in tweet:
                sent = np.append(sent, polarity)
        
        
        if len(sent)!=0:
            ## if mode is 3&2-2, 3&1-1, 2&1-3, 3&2&1 - 3
            senti = find_mode(sent)
            if senti == '2':
                sent_value = "Negative"
            elif senti == '1':
                sent_value = "Positive"
            else:
                sent_value = "Neutral"
            narrative['expression'][(narrative.abstraction == abst) & (narrative.cl_num == i)] = sent_value
        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['1']
['1']
['1', '2']
['1', '2']
['1']
['1']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['1']
['1']
['1']
['1']
['1']
['1']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['1']
['1']
['1']
['1']
['1']
['1']
['2']
['2']
['1']
['1']
['1']
['1']


In [44]:
narrative

Unnamed: 0,cl_num,abstraction,expression
0,0.0,demonetization searches,Negative
1,0.0,narendramodis app,Negative
2,0.0,question parliament,Negative
3,0.0,effects demonetization,Negative
4,0.0,demonetization fishermen kerala,Negative
5,0.0,indias coast,Negative
6,0.0,oscar goes,Negative
7,0.0,government demonetization say,Negative
8,1.0,implement demonetization,Negative
9,1.0,narendra modi,Positive


In [45]:
# Plotting the sentiments
import seaborn as sns
sns.countplot(narrative['expression'])

<matplotlib.axes._subplots.AxesSubplot at 0x1edbccc0>

In [47]:
from pandas import ExcelWriter

#Save the narratives in an excel file 

writer = pd.ExcelWriter('narrative.xlsx')
for i in clusters:
    df1 = pd.DataFrame(dfUnique[['tweet','freq']][dfUnique.cl_num == i]).sort_values(['freq'], ascending = [0])
    df1 = pd.DataFrame({'tweet': dfUnique['tweet'][dfUnique.cl_num == i], 'freq': dfUnique['freq'][dfUnique.cl_num == i]}) 
    df1 = df1.sort_values(['freq'], ascending = [0]) 

    df2 = pd.DataFrame({ 'abstraction': narrative['abstraction'][narrative.cl_num == i], 'expression': narrative['expression'][narrative.cl_num == i]})
    df3 = pd.DataFrame({'abstraction': (len(df1)-len(df2))*['-'], 'expression': (len(df1)-len(df2))*['-']})
    df2 = df2.append(df3)

    df1 = df1.reset_index(drop=True)
    df2 = df2.reset_index(drop=True)
    df1['abstraction'] = df2['abstraction']
    df1['expression'] = df2['expression']

    df1.to_excel(writer,'narrative_cluster'+str(i))

writer.save()
    