In [2]:
import pandas as pd
import time
import numpy as np
from functools import reduce
from lib.util import fetch_tweets, to_unix_tmsp, fetch_X
import matplotlib.pyplot as plt

#latent-factor modeling:
from sklearn.decomposition import PCA,SparsePCA,KernelPCA
from sklearn.manifold import TSNE, Isomap

### Read the CSV file of thread-level features and separate the `is_rumor` tag from the data:

In [19]:
# Path to the thread-level feature CSV (file name: germanwings-crash).
fn = "data/threads/germanwings-crash.csv"

# fetch_X is a project helper from lib.util; presumably it returns a
# pandas DataFrame -- TODO confirm against lib/util.py.
gw_thrds = fetch_X(fn)

# Split the table into the label column and everything else.
gw_thrds_rumortags = gw_thrds["is_rumor"]
gw_thrds_without_rumor_tag = gw_thrds.drop(columns=["is_rumor"])

# Show which columns remain once the label has been removed.
print(gw_thrds_without_rumor_tag.columns.values)


['Unnamed: 0' 'thread' 'user.verified' 'thread_length' 'hashtags_count'
 'user.default_pic' 'urls_count' 'favorite_count' 'has_smile_emoji'
 'retweet_count' 'user.has_bg_img' 'user.tweets_count'
 'src.followers_count' 'src.listed_count' 'src.user_verified' 'created'
 'src.created_at' 'src.tweets_total' 'first_resp' 'last_resp' 'resp_var'
 'time_to_first_resp' 'time_to_last_resp']


### Helper functions:

In [20]:
def convertTrueFalseTo01(X):
    """Return a copy of ``X`` with True/False entries replaced by 1.0/0.0.

    Every entry that compares equal to ``True`` is set to ``1.0`` and every
    entry that compares equal to ``False`` is set to ``0.0``; all other
    entries are kept as they are.  Because Python treats ``1 == True`` and
    ``0 == False`` as true, numeric ones and zeros are matched as well.
    The strings ``'True'`` / ``'False'`` are *not* converted.

    The replacement is a masked assignment, so the container type and dtype
    of ``X`` are preserved (e.g. a ``bool`` array stays ``bool``, an
    ``object`` array receives Python floats).

    Parameters
    ----------
    X : numpy.ndarray or similar
        Any object that provides ``.copy()``, elementwise ``==`` and
        boolean-mask assignment.

    Returns
    -------
    Same type as ``X``
        A new object; the input is left untouched.  Working on a copy keeps
        the caller's data intact and also allows read-only inputs (for
        example arrays that are views onto a DataFrame).
    """
    X = X.copy()
    # `== True` / `== False` are intentional: they build elementwise masks,
    # which `is True` / truthiness checks cannot do.
    X[X == True] = 1.0    # noqa: E712
    X[X == False] = 0.0   # noqa: E712
    return X

def standardize_cols(X, mu=None, sigma=None):
    """Standardize each column of ``X`` to mean 0 and variance 1.

    Parameters
    ----------
    X : array-like
        Data to standardize; statistics are taken along axis 0.
    mu : array-like, optional
        Per-column means to subtract.  Computed from ``X`` when omitted.
    sigma : array-like, optional
        Per-column standard deviations to divide by.  Computed from ``X``
        when omitted.

    Returns
    -------
    tuple
        ``(X_standardized, mu, sigma)`` where ``mu`` and ``sigma`` are the
        statistics that were actually applied, so they can be reused to
        transform other data the same way.

    Notes
    -----
    Entries of ``sigma`` below ``1e-8`` are replaced by ``1`` so that
    (near-)constant columns do not cause a division by zero.  This guard is
    applied to a caller-supplied ``sigma`` as well; in that case a new array
    is built and the caller's object is not modified.
    """
    if mu is None:
        mu = np.mean(X, axis=0)

    if sigma is None:
        sigma = np.std(X, axis=0)
        # Freshly computed here, so modifying it in place is safe.
        sigma[sigma < 1e-8] = 1.
    else:
        # np.where builds a new array, leaving the caller's sigma untouched.
        sigma = np.where(np.asarray(sigma) < 1e-8, 1., sigma)

    return (X - mu) / sigma, mu, sigma


### Data Preprocessing:

In [8]:
# Feature matrix: take the raw values, skip column 0 ('Unnamed: 0' in the
# column list printed above) and turn True/False entries into 1.0/0.0.
# NOTE(review): the 'thread' column is still included after this slice; if it
# is only an identifier it probably should not be used as a feature -- confirm.
gw_thrds_values = convertTrueFalseTo01(gw_thrds_without_rumor_tag.values[:, 1:])

# Cast to float and standardize every column; the returned mean and standard
# deviation are not needed here.
gw_thrds_values, _, _ = standardize_cols(gw_thrds_values.astype(float))

# Labels, passed through the same True/False conversion.
gw_thrds_rumortags_values = convertTrueFalseTo01(gw_thrds_rumortags.values)

# n = number of rows, d = number of feature columns of the final matrix.
n, d = gw_thrds_values.shape
print((n, d))

(405, 22)


### PCA:

In [10]:
# Fit a two-component PCA on the standardized features, then project them.
model = PCA(n_components=2).fit(gw_thrds_values)
Z_PCA = model.transform(gw_thrds_values)

# Scatter plot of the 2-D embedding, coloured by the rumor tag values.
fig, ax = plt.subplots()
ax.set_title("PCA")
ax.scatter(Z_PCA[:, 0], Z_PCA[:, 1], c=gw_thrds_rumortags_values)
plt.show()

### TSNE:

In [12]:
# t-SNE starts from a random initial state, so without a fixed seed every run
# of this cell produces a different embedding.  Pinning random_state makes the
# figure reproducible under Restart & Run All.
model = TSNE(n_components=2, random_state=0)
Z_TSNE = model.fit_transform(gw_thrds_values)

# Scatter plot of the 2-D embedding, coloured by the rumor tag values.
fig, ax = plt.subplots()
ax.set_title("TSNE")
ax.scatter(Z_TSNE[:, 0], Z_TSNE[:, 1], c=gw_thrds_rumortags_values)
plt.show()

### Isomap:


In [14]:
# Isomap embedding into two dimensions, using 4 neighbours per point.
model = Isomap(n_neighbors=4, n_components=2)
Z_Isomap = model.fit_transform(gw_thrds_values)

# Scatter plot of the 2-D embedding, coloured by the rumor tag values.
fig, ax = plt.subplots()
ax.set_title("Isomap")
ax.scatter(Z_Isomap[:, 0], Z_Isomap[:, 1], c=gw_thrds_rumortags_values)
plt.show()

### SparsePCA:

In [16]:
# `normalize_components` is no longer passed: scikit-learn deprecated the
# argument in 0.22 and removed it in 0.24, where passing it raises TypeError.
# From 0.22 on, the behaviour it used to select is the only behaviour.
# NOTE(review): on scikit-learn 0.20/0.21 the old default (False) would apply
# instead -- pin scikit-learn >= 0.22 for this notebook.
model = SparsePCA(n_components=2)
model.fit(gw_thrds_values)

# Stored under its own name (like Z_TSNE / Z_Isomap) so the embedding from the
# PCA cell is not overwritten by this one.
Z_SparsePCA = model.transform(gw_thrds_values)

# Scatter plot of the 2-D embedding, coloured by the rumor tag values.
fig, ax = plt.subplots()
ax.set_title("SparsePCA")
ax.scatter(Z_SparsePCA[:, 0], Z_SparsePCA[:, 1], c=gw_thrds_rumortags_values)
plt.show()

### KernelPCA:

In [18]:
# Two-component KernelPCA; no kernel is passed, so the library default applies.
model = KernelPCA(n_components=2).fit(gw_thrds_values)

# NOTE(review): the result is stored in Z_PCA, the same name the PCA cell
# assigns, so that earlier embedding is replaced from here on.
Z_PCA = model.transform(gw_thrds_values)

# Scatter plot of the 2-D embedding, coloured by the rumor tag values.
fig, ax = plt.subplots()
ax.set_title("KernelPCA")
ax.scatter(Z_PCA[:, 0], Z_PCA[:, 1], c=gw_thrds_rumortags_values)
plt.show()