In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import os
import pickle
from collections import Counter, defaultdict
from itertools import combinations

import numpy as np
import scipy as sp
from tqdm import tqdm
from scipy import sparse
from scipy.sparse import linalg
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the per-account metadata table (columns include party, screen_name and
# state, as the preview below shows). No language filtering happens in this
# cell.
fulldata = pd.read_csv("../lite_data/india_July_21.csv", header=0)
fulldata.head()

Unnamed: 0.1,Unnamed: 0,domain,id,id_str,party,screen_name,state
0,0,1.0,784325894559059968,784325894559059968,To Be Added,Sanjay_Meshack,To Be Added
1,1,1.0,741171197342187521,741171197342187520,BJP,BJP4AnN,Andaman and Nicobar Islands
2,2,1.0,1283244990274002949,1283244990274002944,INC,NSubramanian1,To Be Added
3,3,1.0,935363557050298368,935363557050298368,INC,SatishYepuri,Andhra Pradesh
4,4,1.0,112384119,112384119,BJP,praveenkubjp,Bihar


In [3]:
fulldata.sample(5)

Unnamed: 0.1,Unnamed: 0,domain,id,id_str,party,screen_name,state
13656,13656,1.0,826514822900559877,826514822900559872,JKNC,altaf_kaloo,Jammu and Kashmir
6705,6705,1.0,1079351778468978688,1079351778468978688,BJP,AtulRai62538613,Uttar Pradesh
15110,15110,1.0,402711970,402711970,INC,kuriyedathu,Kerala
10299,10299,1.0,1026161689740369920,1026161689740369920,To Be Added,TSelja,Delhi
6661,6661,1.0,740057367522332672,740057367522332672,BJP,Rohittripathibj,Uttar Pradesh


In [4]:
fulldata.state.unique()

array(['To Be Added', 'Andaman and Nicobar Islands', 'Andhra Pradesh',
       'Bihar', 'Maharashtra', 'Tamil Nadu', 'Manipur', 'Meghalaya',
       'Gujarat', 'Mizoram', 'Nagaland', 'Odisha', 'Overseas',
       'Puducherry', 'Chhattisgarh', 'Madhya Pradesh', 'Punjab',
       'Rajasthan', 'Delhi', 'Uttar Pradesh', 'Arunachal Pradesh',
       'Karnataka', 'Haryana', 'Uttarakhand', 'Telangana',
       'Himachal Pradesh', 'West Bengal', 'Tripura', 'Chandigarh',
       'Dadra and Nagar Haveli', 'Sikkim', 'Jammu and Kashmir',
       'Daman and Diu', 'Goa', 'Ladakh', 'Assam', 'Lakshadweep',
       'Jharkhand', 'Kerala'], dtype=object)

In [5]:
svd_embedding_size = 20

In [6]:
data = pd.read_csv("../lite_data/jan15_extracted_domains.csv", header=0)
data.head()

Unnamed: 0.1,Unnamed: 0,author,domain,count
0,0,,1.si,1
1,1,,1950.today,1
2,2,,4lakhdenahoga.com,2
3,3,,54.xn--h2brj9c,1
4,4,,5dariyanews.com,1


In [None]:
# Second extracted-domains file; presumably the same author/domain/count
# layout as the first extract — TODO confirm against the preview.
data1 = pd.read_csv("../lite_data/jan21_extracted_domains.csv", header=0)
data1.head()

In [None]:
# Stack both extracts and drop the rows that have no author.
# NOTE(review): `data` is rebuilt from itself here, so re-running this cell
# appends `data1` a second time. Run it exactly once per kernel session.
data = pd.concat([data, data1])
data = data.dropna(subset=['author'])

In [None]:
# Annotated domain list; its 'international' and 'domain' columns are read by
# the next cell to pick the domains that are kept.
news_domains = pd.read_csv("../lite_data/domaintitles_annotated_no_international.csv", header=0)
news_domains.head()

In [None]:
newslist = news_domains.loc[news_domains['international']=='domestic']['domain'].tolist()

In [None]:
print(newslist)

In [None]:
data = data.dropna(subset=['author'])
data = data.loc[data['domain'].isin(newslist)]

In [None]:
len(data)

In [None]:
data.head()

In [None]:
authornames = list(set(data['author'].tolist()))
authornames = sorted(authornames)
print(authornames[:20])

In [None]:
decimal_authors = []
for a in authornames:
    if a.isdecimal():
        decimal_authors.append(a)
        
print(decimal_authors)

In [None]:
data = data.loc[~data['author'].isin(decimal_authors)]

In [None]:
author_totals = data.groupby('author').size().reset_index().rename(columns={0:'authorcontrib'})

In [None]:
author2contrib = author_totals.set_index('author').to_dict()['authorcontrib']

In [None]:
data = data.groupby(['author','domain']).size().reset_index().rename(columns={0:'domaincontrib'})

In [None]:
data.head()

In [None]:
# Share of each author's total that goes to the row's domain. Computed as one
# vectorised division instead of a Python-level lambda call per row.
data['domain_prop'] = data['domaincontrib'] / data['author'].map(author2contrib)

In [None]:
data.head()

In [None]:
# Wide table: one row per domain, one column per author, cell values taken
# from 'domaincontrib' (pivot_table aggregates with its default aggfunc).
# reset_index() turns the domain index back into a leading 'domain' column.
formatted_data = pd.pivot_table(data, values='domaincontrib', index='domain', columns='author').reset_index()
formatted_data.head()

In [None]:
# (domain, author) combinations that never occur come out of the pivot as NaN;
# treat them as zero contribution.
formatted_data = formatted_data.fillna(0.0)

In [None]:
formatted_data.head()

In [None]:
# Every column label except the first one, i.e. everything but 'domain'.
authorlist = list(formatted_data)[1:]

In [None]:
# Domain-by-author matrix of contributions.
# NOTE(review): `countMat` is reassigned to a domain-by-domain matrix a few
# cells further down before any visible code reads this one, so this
# assignment only serves the shape display.
countMat = formatted_data[authorlist].values
countMat.shape

In [None]:
perauthor = data.groupby('author')['domain'].apply(list).reset_index()
perauthor.head()

In [None]:
perauthor['domain'] = perauthor['domain'].apply(lambda x: list(set(x)))
perauthor.head()

In [None]:
domainIndex = formatted_data['domain'].tolist()

In [None]:
countMat = np.zeros((len(domainIndex), len(domainIndex)), dtype=np.float)

In [None]:
countMat.shape

In [None]:
for idx, row in perauthor.iterrows():
    domainlist = row['domain']
    if len(domainlist)>1:
        for i in range(len(domainlist)-1):
            i_index = domainIndex.index(domainlist[i])
            countMat[i_index, i_index]+=1
            for j in range(i+1, len(domainlist)):
                i_index = domainIndex.index(domainlist[i])
                j_index = domainIndex.index(domainlist[j])
                countMat[i_index, j_index]+=1
                countMat[j_index, i_index]+=1
        countMat[j_index, j_index]+=1
                

In [None]:
countMat

In [None]:
def get_pmi_matrix(count_matrix, alpha=0.50, positive_only=False):
    """Turn a co-occurrence count matrix into a sparse (P)PMI matrix.

    For every non-zero cell ``(w, c)`` of ``count_matrix`` the result holds
    ``log2(P(w, c) / (P(w) * P_alpha(c)))``, where ``P(w, c)`` and ``P(w)``
    are the cell and row totals divided by the grand total, and ``P_alpha(c)``
    is the column total raised to ``alpha`` and normalised over all columns
    (context distribution smoothing).

    Parameters
    ----------
    count_matrix : scipy sparse matrix or 2-D array-like
        Co-occurrence counts; rows are words, columns are contexts.
    alpha : float, default 0.5
        Exponent applied to the column totals before normalising them.
    positive_only : bool, default False
        When true, negative PMI values are clipped to 0 (PPMI).

    Returns
    -------
    scipy.sparse.csr_matrix
        PMI values at the non-zero positions of ``count_matrix``. The result
        always has the same shape as the input, including trailing rows or
        columns that contain no counts.
    """
    counts = sparse.csr_matrix(count_matrix)

    # for standard PPMI
    DD = counts.sum()
    sum_over_contexts = np.asarray(counts.sum(axis=1)).ravel()
    sum_over_words = np.asarray(counts.sum(axis=0)).ravel()

    # for context distribution smoothing (cds)
    sum_over_words_alpha = sum_over_words ** alpha
    Pc_alpha_denom = np.sum(sum_over_words_alpha)

    # Coordinates and values of the genuinely non-zero cells. Duplicates are
    # merged and explicitly stored zeros are dropped so that each cell is
    # visited exactly once.
    coo = counts.tocoo()
    coo.sum_duplicates()
    stored = coo.data != 0
    row_indxs = coo.row[stored]
    col_indxs = coo.col[stored]
    pound_wc = coo.data[stored]

    # All cells are evaluated at once; the formula per cell is unchanged.
    Pwc = pound_wc / DD
    Pw = sum_over_contexts[row_indxs] / DD
    Pc_alpha = sum_over_words_alpha[col_indxs] / Pc_alpha_denom

    pmi_dat_values = np.log2(Pwc / (Pw * Pc_alpha))
    if positive_only:
        pmi_dat_values = np.maximum(pmi_dat_values, 0)

    print('building ppmi matrix')
    # The shape is passed explicitly: inferring it from the largest stored
    # index would shrink the matrix whenever the last rows/columns are empty.
    return sparse.csr_matrix(
        (pmi_dat_values, (row_indxs, col_indxs)), shape=counts.shape
    )

In [None]:
PMImat = get_pmi_matrix(sparse.csr_matrix(countMat))

In [None]:
def get_svd_matrix(pmi_matrix, embedding_size=200,
                   svd_diag_exponent=0.5):
    """Embed the rows of ``pmi_matrix`` with a truncated SVD.

    ``scipy.sparse.linalg.svds`` is asked for ``embedding_size`` singular
    triplets. The left singular vectors are scaled column-wise by the
    singular values raised to ``svd_diag_exponent`` and returned as an array
    of shape ``(n_rows, embedding_size)``. The shapes of the intermediate
    factors are printed along the way.
    """
    left_vecs, sing_vals, right_vecs = linalg.svds(pmi_matrix, embedding_size)

    diagnostics = (
        ('ppmi size: {}', pmi_matrix.shape),
        ('embedding size: {}', embedding_size),
        ('uu.shape: {}', left_vecs.shape),
        ('ss.shape: {}', sing_vals.shape),
        ('vv.shape: {}', right_vecs.shape),
    )
    for template, value in diagnostics:
        print(template.format(value))

    # Weight each left singular vector by its (exponentiated) singular value.
    scaling = np.diag(sing_vals ** svd_diag_exponent)
    svd_word_vecs = left_vecs @ scaling
    print(svd_word_vecs.shape)
    return svd_word_vecs

In [None]:
svdMat = get_svd_matrix(PMImat, embedding_size=svd_embedding_size)

In [None]:
svdMat.shape

In [None]:
# One row per domain: the domain name followed by its embedding coordinates,
# stored in columns labelled 0 .. svd_embedding_size - 1.
embedding_columns = {dim: svdMat[:, dim] for dim in range(svd_embedding_size)}
domain_embeddings = pd.DataFrame({'domain': domainIndex, **embedding_columns})

In [None]:
domain_embeddings.head()

In [None]:
#domain_embeddings.to_csv("../big_data/domain_pmi_svd.csv")

## K-means clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

In [None]:
X = svdMat
N_OPTIONS = [2,3,4,5,6,7,8,9,10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

In [None]:
sils = []
for n_clusters in N_OPTIONS:
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(X)
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters =",n_clusters,"The average silhouette_score is :",silhouette_avg,)
    sils.append(silhouette_avg)


In [None]:
plt.plot(range(2, 21), sils)

In [None]:
# Final clustering with 10 clusters; the labels are stored next to the
# embeddings.
# NOTE(review): this fit uses random_state=15 whereas the silhouette sweep
# used random_state=10, so the 10-cluster partition scored there is not
# necessarily the one kept here — confirm this is intended.
clusterer = KMeans(n_clusters=10, random_state=15)
y_hat = clusterer.fit_predict(X)
domain_embeddings['labels'] = y_hat

In [None]:
domain_embeddings.head()

## t-SNE visualization

In [None]:
import numpy as np
from sklearn.manifold import TSNE

# Project the embeddings to 2-D for plotting. The seed is fixed because t-SNE
# with a random initialisation otherwise produces a different layout on every
# run, which makes the figures below impossible to reproduce.
X_embedded = TSNE(n_components=2, learning_rate='auto',
                  init='random', random_state=10).fit_transform(X)
X_embedded.shape


In [None]:
domain_embeddings['x'] = X_embedded[:,0]
domain_embeddings['y'] = X_embedded[:,1]

In [None]:
domain_embeddings.head()

In [None]:
import seaborn as sns

In [None]:
plt.figure(figsize=(7,5))
# Cluster ids are categories, not magnitudes. A qualitative palette and a full
# legend give every cluster its own colour and its own legend entry; with a
# bare numeric hue seaborn uses a sequential colormap and abbreviates the
# legend.
sns.scatterplot(x='x', y='y', hue='labels', data=domain_embeddings,
                palette='tab10', legend='full');

In [None]:
from bokeh.models import HoverTool, LabelSet
from bokeh.plotting import figure, show, ColumnDataSource, output_file
from bokeh.models import Legend
from bokeh.models import CategoricalColorMapper
from bokeh.palettes import Category10, Category20
from bokeh.transform import factor_cmap


# factor_cmap works on string factors, so the integer cluster ids are
# converted to strings.
domain_embeddings['str_label'] = domain_embeddings['labels'].astype(str)

x = X_embedded[:,0]
y = X_embedded[:,1]
# The points are domains; the variable and column names are kept as they were.
author_names = domainIndex

source = ColumnDataSource(
        data=dict(
            x=x,
            y=y,
            author_names=author_names,
            author_types=domain_embeddings['str_label'].tolist(),
        )
    )

# Mouse-over info: the domain behind the point and the cluster it belongs to.
hover = HoverTool(
        tooltips=[
        ("domain", "@author_names"),
        ("cluster", "@author_types"),
        ]
    )

# One colour per cluster that is actually present. A fixed list of four
# factors would leave every cluster with id >= 4 in the NaN colour.
cluster_factors = sorted(domain_embeddings['str_label'].unique(), key=int)
base_palette = Category10[10] if len(cluster_factors) <= 10 else Category20[20]
cluster_palette = [base_palette[k % len(base_palette)]
                   for k in range(len(cluster_factors))]
index_cmap = factor_cmap(field_name='author_types', palette=cluster_palette,
                         factors=cluster_factors)


# width/height replace plot_width/plot_height, which Bokeh 3 no longer accepts.
p = figure(tools=[hover, 'crosshair,pan,wheel_zoom,box_zoom,reset,save,lasso_select'], width=500, height=500)
p.scatter('x', 'y', source=source,
          color=index_cmap, size=10,
          fill_alpha=0.8, line_color=None)

output_file("foo.html")
show(p)

In [None]:
domain_embeddings.loc[domain_embeddings['labels']==1].head(50)