In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%load_ext autoreload
%autoreload 2

from settings import *
from utils import human_format

In [None]:
# Read the tab-separated table stored at `first_level_graph_file`
# (the path is defined outside this cell) into the working frame `df`.
df = pd.read_csv(first_level_graph_file, delimiter='\t')

In [None]:
# Headline numbers for the raw table. The three placeholder URLs stand in for
# rows without a usable link; each is counted separately.
out_urls = df['out_url']
print('Tweets:', human_format(df['tweet'].nunique(dropna=False)))
print('URLs:', human_format(len(out_urls)))
for label, placeholder in [
    ('Tweets without URL:', 'http://TweetWithoutURL.org'),
    ('HTTP Errors:', 'http://HTTPError.org'),
    ('Timeout Errors:', 'http://TimeoutError.org'),
]:
    # Summing the boolean mask counts the rows equal to the placeholder.
    print(label, human_format(int(out_urls.isin([placeholder]).sum())))

In [None]:
#Keep only year
# Parse the 'dd.mm.yy HH:MM' strings in one vectorised call instead of a
# per-row strptime. The dtype guard makes the cell safe to re-run: once the
# column already holds numeric years there is nothing left to parse.
if not pd.api.types.is_numeric_dtype(df['date']):
    df['date'] = pd.to_datetime(df['date'], format='%d.%m.%y %H:%M').dt.year

In [None]:
# Work on a copy so the bucketing below does not touch `df`.
placeholder_urls = ['http://TweetWithoutURL.org', 'http://HTTPError.org', 'http://TimeoutError.org']
df1 = df.copy()
# Keep the three placeholder URLs as they are; every other link becomes one
# shared 'working' label so the plot has four lines in total.
df1['out_url'] = df1['out_url'].where(df1['out_url'].isin(placeholder_urls), 'http://WorkingURL.org')
# Rows: year, columns: URL status, cells: number of tweets.
status_by_year = df1.pivot_table(index='date', columns='out_url', values='tweet', aggfunc='count')
# Empty cells are filled with 1 (not 0) so they can be drawn on the log axis.
status_by_year.fillna(1).plot(logy=True)

In [None]:
#cleaning
# Drop the rows whose out_url is one of the three placeholder URLs.
# .copy() yields an independent frame, so adding the 'netloc' column below
# does not write into a slice of the original (SettingWithCopyWarning).
placeholder_urls = ['http://TweetWithoutURL.org', 'http://HTTPError.org', 'http://TimeoutError.org']
df = df[~df['out_url'].isin(placeholder_urls)].copy()
# Host part of every link with a leading "www." removed. Only 'out_url' is
# needed, so map over that column instead of a row-wise DataFrame.apply.
df['netloc'] = df['out_url'].map(lambda u: re.sub(r'^(http://)?(www\.)?', r'', urlsplit(u).netloc))
#mostly photos and reposts
df = df[~df['netloc'].isin(['twitter.com', 'facebook.com', 'google.com'])].copy()

In [None]:
# Number of distinct links and distinct hosts in the current `df`.
for label, column in (('Unique URLs:', 'out_url'), ('Unique network locations:', 'netloc')):
    print(label, human_format(df[column].nunique(dropna=False)))

# Institutions

In [None]:
inst = pd.read_csv(institutionsFile, sep='\t')
# Normalise the institution URLs: strip a leading "www."/"wwwN." and then a
# leading "web.". Both parts are anchored to the start of the string; the
# previous pattern `^(www[0-9]?\.)|(web\.)` anchored only the first
# alternative and therefore deleted "web." wherever it occurred
# (e.g. "cobweb.edu" became "cobedu").
inst['URL'] = inst['URL'].apply(lambda u: re.sub(r'^(?:www[0-9]?\.)?(?:web\.)?', r'', u))
def find_inst(netloc, inst):
    """Map a host name onto the institution domain it belongs to.

    Parameters
    ----------
    netloc : str
        Network location of a link, e.g. ``'cs.stanford.edu'``.
    inst : iterable
        Candidate institution domains, checked in iteration order.

    Returns
    -------
    str
        The first entry of ``inst`` that equals the host or is a parent
        domain of it; ``netloc`` unchanged when nothing matches.
    """
    # Compare on the bare host: drop any "user@" prefix and ":port" suffix.
    host = netloc.rsplit('@', 1)[-1].split(':', 1)[0]
    for domain in inst:
        # Blank or missing (NaN) entries can never identify an institution;
        # an empty string would otherwise match every host.
        if not isinstance(domain, str) or not domain:
            continue
        # Match on a label boundary so "mit.edu" matches "web.mit.edu" but
        # not "summit.edu", which a plain substring test would accept.
        if host == domain or host.endswith('.' + domain):
            return domain
    return netloc

# Replace each host by its institution domain where find_inst finds one.
# The lookup is done once per distinct host against a plain list, instead of
# re-iterating the URL Series for every row of `df`.
inst_urls = inst['URL'].tolist()
inst_by_netloc = {host: find_inst(host, inst_urls) for host in df['netloc'].unique()}
# assign() builds a new frame, so nothing is written into a filtered slice.
df = df.assign(netloc=df['netloc'].map(inst_by_netloc))

In [None]:
# Inner join: keep only the links whose netloc equals an institution URL and
# attach that institution's columns. Note that `inst` is rebound from the
# institutions table to the joined table here.
inst = pd.merge(df, inst, how='inner', left_on='netloc', right_on='URL')

In [None]:
# Ten institutions with the largest number of rows in the joined table.
rows_per_institution = inst.groupby('Institution').size()
rows_per_institution.sort_values(ascending=False).head(10)

In [None]:
print('Most popular Institutions')
# Select 'popularity' before aggregating: averaging the whole frame also
# touches the text columns, which raises a TypeError in pandas >= 2.0.
inst.groupby('Institution')['popularity'].mean().sort_values(ascending=False).head(10)

In [None]:
# Per-institution mean of the two plotted columns only; averaging every
# column would fail on the text columns in pandas >= 2.0.
inst.groupby('Institution')[['Score', 'popularity']].mean().plot.scatter(x='Score', y='popularity')

In [None]:
# Columns entering the correlation matrix; 'popularity' comes first.
metric_columns = ['popularity', 'World Rank', 'National Rank', 'Alumni Employment', 'Publications', 'Influence', 'Citations', 'Broad Impact', 'Patents', 'Score']
# Restrict to these columns before averaging so the text columns never reach
# mean(), which raises on non-numeric data in pandas >= 2.0.
corr = inst.groupby('Institution')[metric_columns].mean().corr()
#sns.heatmap(corr, xticklabels=corr.columns.values, yticklabels=corr.columns.values)
# Correlation of popularity with every metric, selected by label.
corr.loc['popularity']

In [None]:
country_names = pd.read_csv(countriesFile)
# Translate each row's user_country code into the country's Name and keep it
# next to the institution's Location. Rows missing either value are dropped
# because they cannot form an edge in the graph built from this table.
countries = (
    inst[['user_country', 'Location']]
    .merge(country_names, left_on='user_country', right_on='Code')
    [['Name', 'Location']]
    .dropna()
    .copy()
)
# Rewrite 'USA' in the Location column only. Without the column selector the
# assignment replaced every column of the matching rows, which also
# overwrote the user's country Name with 'United States'.
countries.loc[countries['Location'] == 'USA', 'Location'] = 'United States'
# Suffix the two sides so a country appearing as both user country and
# institution country yields two distinct labels.
countries['Name'] = countries['Name'] + '_user'
countries['Location'] = countries['Location'] + '_inst'

In [None]:
import networkx as nx
from networkx.algorithms import bipartite
# Undirected graph with one edge per (Name, Location) row of `countries`.
B = nx.Graph()
B.add_edges_from(zip(countries['Name'], countries['Location']))

In [None]:
plt.figure(figsize=(10,10))
# Split the nodes by the '_user' / '_inst' suffix their labels were given
# when the `countries` table was built. bipartite.sets(B) cannot be used
# reliably here: it raises AmbiguousSolution when the graph is disconnected.
# Sorting makes the vertical order reproducible between runs.
X = sorted(n for n in B if n.endswith('_user'))
Y = sorted(n for n in B if not n.endswith('_user'))
pos = dict()
pos.update( (n, (1, i)) for i, n in enumerate(X) ) # put nodes from X at x=1
pos.update( (n, (2, i*4)) for i, n in enumerate(Y) ) # put nodes from Y at x=2
nx.draw(B, pos=pos, with_labels = True)

# Repositories

In [None]:
repos = pd.read_csv(repositoriesFile)
# Reduce each repository URL to the form used for df['netloc']: no scheme and
# no leading "www.". The scheme is optional and may be http or https;
# the previous pattern stripped only "http://", so https or scheme-less
# "www." entries could never equal a netloc.
repos['URL'] = repos['URL'].apply(lambda u: re.sub(r'^(?:https?://)?(?:www\.)?', r'', u))

In [None]:
def find_repo(netloc, repos):
    """Map a host name onto the repository domain it belongs to.

    Parameters
    ----------
    netloc : str
        Network location of a link, e.g. ``'export.arxiv.org'``.
    repos : iterable
        Candidate repository domains, checked in iteration order.

    Returns
    -------
    str
        The first entry of ``repos`` that equals the host or is a parent
        domain of it; ``netloc`` unchanged when nothing matches.
    """
    # Compare on the bare host: drop any "user@" prefix and ":port" suffix.
    host = netloc.rsplit('@', 1)[-1].split(':', 1)[0]
    for domain in repos:
        # Blank or missing (NaN) entries can never identify a repository;
        # an empty string would otherwise match every host.
        if not isinstance(domain, str) or not domain:
            continue
        # Match on a label boundary so "arxiv.org" matches "export.arxiv.org"
        # but not "notarxiv.org", which a plain substring test would accept.
        if host == domain or host.endswith('.' + domain):
            return domain
    return netloc

# Replace each host by its repository domain where find_repo finds one.
# The lookup is done once per distinct host against a plain list, instead of
# re-iterating the URL Series for every row of `df`.
repo_urls = repos['URL'].tolist()
repo_by_netloc = {host: find_repo(host, repo_urls) for host in df['netloc'].unique()}
# assign() builds a new frame, so nothing is written into a filtered slice.
df = df.assign(netloc=df['netloc'].map(repo_by_netloc))
# Inner join: keep only links whose netloc equals a repository URL. `repos`
# is rebound from the repositories table to the joined table here.
repos = df.merge(repos, left_on='netloc', right_on='URL')

In [None]:
# Number of joined rows per repository name, most frequent first.
repos['Name'].value_counts(sort=True, ascending=False)

In [None]:
# Remove every row whose tweet appears in the institution join or in the
# repository join; only the remaining tweets stay in `df`.
matched_tweets = pd.concat([inst['tweet'], repos['tweet']])
df = df[~df['tweet'].isin(matched_tweets)]

In [None]:
# Write one "<url>\t<host>" line per distinct remaining link. The encoding is
# fixed to UTF-8 so the file does not depend on the platform default and
# non-ASCII characters in a URL cannot make the write fail.
with open(second_level_urls_file, 'w', encoding='utf-8') as f:
    #f.write('URL\tnetloc\n')
    for u in df['out_url'].unique():
        # Host is recomputed from the URL itself (leading "www." removed).
        host = re.sub(r'^(http://)?(www\.)?', r'', urlsplit(u).netloc)
        f.write(u + '\t' + host + '\n')