In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge, Lasso
from sklearn.decomposition import PCA

In [2]:
def bucket(core_periphery):
    """Collapse a core-periphery value into a coarse bucket.

    Values above 7 map to 5, values in (4, 7] map to 4 and values in
    (2, 4] map to 3.  Anything else (2 or below, or a value for which
    every comparison is false, such as NaN) is returned unchanged.
    """
    # Thresholds are checked from the highest down, so each test only
    # needs its lower bound.
    if core_periphery > 7:
        return 5
    if core_periphery > 4:
        return 4
    if core_periphery > 2:
        return 3
    return core_periphery

# DT

In [3]:
# Load the pickled DT dataset that the following cells model.
df = pd.read_pickle('Data/Processed Data/ML/DT-ml.pkl')

In [67]:
# Target is `core_periphery`; features are the three count columns.
y = df['core_periphery']
X = df[['count', 'headline_count', 'text_count']]

# Hold out 20% of the rows for testing, with a fixed seed and stratified on
# the target values.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [68]:
# Fit a linear regression on the training split; the value returned by `fit`
# is what the cell displays.
reg = LinearRegression()
reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [69]:
# Predictions for the held-out rows, used for the RMSE in the next cell.
y_pred = reg.predict(X_test)

In [70]:
# Held-out evaluation: `score` reports R^2, and RMSE is the square root of
# the mean squared error of the predictions.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("R^2: {}".format(reg.score(X_test, y_test)))
print("Root Mean Squared Error: {}".format(rmse))

R^2: -0.2737323291072866
Root Mean Squared Error: 1.5929200809333213


In [74]:
# 5-fold cross-validation of a fresh linear model on the full X/y; the
# printed value is the mean of the fold scores.
reg = LinearRegression()
cvscores_5 = cross_val_score(reg, X, y, cv=5)
print(cvscores_5.mean())

0.05046988280888387


In [78]:
# Ridge regression (alpha=0.1) fitted on the training split; the cell
# displays its score on the held-out split.
# NOTE(review): `normalize=` is no longer accepted by recent scikit-learn
# releases; confirm the pinned version before re-running.
ridge = Ridge(alpha=0.1, normalize=True).fit(X_train, y_train)
ridge.score(X_test, y_test)

-0.2475348233840593

In [79]:
# Lasso regression (alpha=0.1) fitted on the training split; the cell
# displays its score on the held-out split.
# NOTE(review): `normalize=` is no longer accepted by recent scikit-learn
# releases; confirm the pinned version before re-running.
lasso = Lasso(alpha=0.1, normalize=True).fit(X_train, y_train)
lasso.score(X_test, y_test)

-1.2241643232435706e-07

#### Changing cores to buckets

In [80]:
# Cap the target in place: every `core_periphery` value above 4 becomes 5.
# NOTE(review): this is a different bucketing rule from the `bucket` helper
# applied to the DS and NA datasets; confirm the difference is intentional.
df.loc[df['core_periphery'] > 4, 'core_periphery'] = 5

In [81]:
# Rebuild features and target from `df` so the target reflects the capped
# `core_periphery` column.
y = df['core_periphery']
X = df[['count', 'headline_count', 'text_count']]

# Hold out 20% of the rows for testing, with a fixed seed and stratified on
# the target values.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [82]:
# Fit a linear regression on the training split; the value returned by `fit`
# is what the cell displays.
reg = LinearRegression()
reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [83]:
# Predictions for the held-out rows, used for the RMSE in the next cell.
y_pred = reg.predict(X_test)

In [84]:
# Held-out evaluation: `score` reports R^2, and RMSE is the square root of
# the mean squared error of the predictions.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("R^2: {}".format(reg.score(X_test, y_test)))
print("Root Mean Squared Error: {}".format(rmse))

R^2: -0.20896204394995022
Root Mean Squared Error: 1.357439850993599


In [85]:
# 5-fold cross-validation of a fresh linear model on the full X/y; the
# printed value is the mean of the fold scores.
reg = LinearRegression()
cvscores_5 = cross_val_score(reg, X, y, cv=5)
print(cvscores_5.mean())

0.021403563918336886


In [86]:
# Ridge regression (alpha=0.1) fitted on the training split; the cell
# displays its score on the held-out split.
# NOTE(review): `normalize=` is no longer accepted by recent scikit-learn
# releases; confirm the pinned version before re-running.
ridge = Ridge(alpha=0.1, normalize=True).fit(X_train, y_train)
ridge.score(X_test, y_test)

-0.1908623731047372

In [87]:
# Lasso regression (alpha=0.1) fitted on the training split; the cell
# displays its score on the held-out split.
# NOTE(review): `normalize=` is no longer accepted by recent scikit-learn
# releases; confirm the pinned version before re-running.
lasso = Lasso(alpha=0.1, normalize=True).fit(X_train, y_train)
lasso.score(X_test, y_test)

-3.3276200595011574e-07

# DS

In [88]:
# Load the pickled DS dataset that the following cells model.
df = pd.read_pickle('Data/Processed Data/ML/DS-ml.pkl')

In [89]:
# Target is `core_periphery`; features are the three count columns.
y = df['core_periphery']
X = df[['count', 'headline_count', 'text_count']]

# Hold out 20% of the rows for testing, with a fixed seed and stratified on
# the target values.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [90]:
# Fit a linear regression on the training split; the value returned by `fit`
# is what the cell displays.
reg = LinearRegression()
reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [91]:
# Predictions for the held-out rows, used for the RMSE in the next cell.
y_pred = reg.predict(X_test)

In [92]:
# Held-out evaluation: `score` reports R^2, and RMSE is the square root of
# the mean squared error of the predictions.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("R^2: {}".format(reg.score(X_test, y_test)))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.031043966400095654
Root Mean Squared Error: 3.767614059572202


In [93]:
# 5-fold cross-validation of a fresh linear model on the full X/y; the
# printed value is the mean of the fold scores.
reg = LinearRegression()
cvscores_5 = cross_val_score(reg, X, y, cv=5)
print(cvscores_5.mean())

0.11122115714892863


In [94]:
# Ridge regression (alpha=0.1) fitted on the training split; the cell
# displays its score on the held-out split.
# NOTE(review): `normalize=` is no longer accepted by recent scikit-learn
# releases; confirm the pinned version before re-running.
ridge = Ridge(alpha=0.1, normalize=True).fit(X_train, y_train)
ridge.score(X_test, y_test)

0.07771027958486343

In [95]:
# Lasso regression (alpha=0.1) fitted on the training split; the cell
# displays its score on the held-out split.
# NOTE(review): `normalize=` is no longer accepted by recent scikit-learn
# releases; confirm the pinned version before re-running.
lasso = Lasso(alpha=0.1, normalize=True).fit(X_train, y_train)
lasso.score(X_test, y_test)

-6.326378199084104e-08

#### Changing cores to buckets

In [41]:
# Reload the DS dataset and replace `core_periphery` with its bucketed value
# (element-wise application of `bucket`).
df = (
    pd.read_pickle('Data/Processed Data/ML/DS-ml.pkl')
    .assign(core_periphery=lambda frame: frame['core_periphery'].apply(bucket))
)

In [42]:
# Target is the bucketed `core_periphery`; features are the three count
# columns.
y = df['core_periphery']
X = df[['count', 'headline_count', 'text_count']]

# Hold out 20% of the rows for testing, with a fixed seed and stratified on
# the target values.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [43]:
# Fit a linear regression on the training split; the value returned by `fit`
# is what the cell displays.
reg = LinearRegression()
reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [44]:
# Predictions for the held-out rows, used for the RMSE in the next cell.
y_pred = reg.predict(X_test)

In [45]:
# Held-out evaluation: `score` reports R^2, and RMSE is the square root of
# the mean squared error of the predictions.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("R^2: {}".format(reg.score(X_test, y_test)))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.07019883240017444
Root Mean Squared Error: 1.3061066305903795


In [96]:
# 5-fold cross-validation of a fresh linear model on the full X/y; the
# printed value is the mean of the fold scores.
# NOTE(review): the recorded output of this cell matches the un-bucketed DS
# run digit for digit, and its execution counter (96) is far from the
# bucketed split (42). The saved number was likely computed on stale X/y;
# re-run the notebook top to bottom before relying on it.
reg = LinearRegression()
cvscores_5 = cross_val_score(reg, X, y, cv=5)
print(cvscores_5.mean())

0.11122115714892863


In [97]:
# Ridge regression (alpha=0.1) fitted on the training split; the cell
# displays its score on the held-out split.
# NOTE(review): the recorded output matches the un-bucketed DS run digit for
# digit (execution counter 97 vs. bucketed split 42), so it was likely
# produced from a stale split; re-run in order.
# NOTE(review): `normalize=` is no longer accepted by recent scikit-learn
# releases; confirm the pinned version before re-running.
ridge = Ridge(alpha=0.1, normalize=True).fit(X_train, y_train)
ridge.score(X_test, y_test)

0.07771027958486343

In [98]:
# Lasso regression (alpha=0.1) fitted on the training split; the cell
# displays its score on the held-out split.
# NOTE(review): the recorded output matches the un-bucketed DS run digit for
# digit (execution counter 98 vs. bucketed split 42), so it was likely
# produced from a stale split; re-run in order.
# NOTE(review): `normalize=` is no longer accepted by recent scikit-learn
# releases; confirm the pinned version before re-running.
lasso = Lasso(alpha=0.1, normalize=True).fit(X_train, y_train)
lasso.score(X_test, y_test)

-6.326378199084104e-08

# NA

In [114]:
# Load the pickled NA dataset that the following cells model.
df = pd.read_pickle('Data/Processed Data/ML/NA-ml.pkl')

In [115]:
# Target is `core_periphery`; features are the three count columns.
y = df['core_periphery']
X = df[['count', 'headline_count', 'text_count']]

# Hold out 20% of the rows for testing, with a fixed seed and stratified on
# the target values.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [116]:
# Fit a linear regression on the training split; the value returned by `fit`
# is what the cell displays.
reg = LinearRegression()
reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [117]:
# Predictions for the held-out rows, used for the RMSE in the next cell.
y_pred = reg.predict(X_test)

In [109]:
# Held-out evaluation: `score` reports R^2, and RMSE is the square root of
# the mean squared error of the predictions.
# NOTE(review): this cell's execution counter (109) is lower than those of
# the load/split/fit/predict cells above it (114-117), so the recorded output
# may predate them; re-run in order to confirm.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("R^2: {}".format(reg.score(X_test, y_test)))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.07874407739652711
Root Mean Squared Error: 3.9132775339149433


In [110]:
# 5-fold cross-validation of a fresh linear model on the full X/y; the
# printed value is the mean of the fold scores.
# NOTE(review): execution counter 110 precedes the NA load/split cells
# (114-115), so the recorded output may predate them; re-run in order.
reg = LinearRegression()
cvscores_5 = cross_val_score(reg, X, y, cv=5)
print(cvscores_5.mean())

0.04446857039748133


In [111]:
# Ridge regression (alpha=0.1) fitted on the training split; the cell
# displays its score on the held-out split.
# NOTE(review): execution counter 111 precedes the NA load/split cells
# (114-115), so the recorded output may predate them; re-run in order.
# NOTE(review): `normalize=` is no longer accepted by recent scikit-learn
# releases; confirm the pinned version before re-running.
ridge = Ridge(alpha=0.1, normalize=True).fit(X_train, y_train)
ridge.score(X_test, y_test)

0.07461019540368463

In [112]:
# Lasso regression (alpha=0.1) fitted on the training split; the cell
# displays its score on the held-out split.
# NOTE(review): execution counter 112 precedes the NA load/split cells
# (114-115), so the recorded output may predate them; re-run in order.
# NOTE(review): `normalize=` is no longer accepted by recent scikit-learn
# releases; confirm the pinned version before re-running.
lasso = Lasso(alpha=0.1, normalize=True).fit(X_train, y_train)
lasso.score(X_test, y_test)

-4.0767910669536894e-07

#### Changing cores to buckets

In [52]:
# Reload the NA dataset and replace `core_periphery` with its bucketed value
# (element-wise application of `bucket`).
df = (
    pd.read_pickle('Data/Processed Data/ML/NA-ml.pkl')
    .assign(core_periphery=lambda frame: frame['core_periphery'].apply(bucket))
)

In [53]:
# Target is the bucketed `core_periphery`; features are the three count
# columns.
y = df['core_periphery']
X = df[['count', 'headline_count', 'text_count']]

# Hold out 20% of the rows for testing, with a fixed seed and stratified on
# the target values.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [54]:
# Fit a linear regression on the training split; the value returned by `fit`
# is what the cell displays.
reg = LinearRegression()
reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [55]:
# Predictions for the held-out rows, used for the RMSE in the next cell.
y_pred = reg.predict(X_test)

In [56]:
# Held-out evaluation: `score` reports R^2, and RMSE is the square root of
# the mean squared error of the predictions.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("R^2: {}".format(reg.score(X_test, y_test)))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.05924100598517956
Root Mean Squared Error: 1.347206418807225


In [102]:
# 5-fold cross-validation of a fresh linear model on the full X/y; the
# printed value is the mean of the fold scores.
# NOTE(review): the recorded output of this cell matches the DS cross-
# validation figure digit for digit, and its execution counter (102) is far
# from the NA bucketed split (53). The saved number was likely computed while
# X/y still held DS data; re-run the notebook top to bottom.
reg = LinearRegression()
cvscores_5 = cross_val_score(reg, X, y, cv=5)
print(cvscores_5.mean())

0.11122115714892863


In [103]:
# Ridge regression (alpha=0.1) fitted on the training split; the cell
# displays its score on the held-out split.
# NOTE(review): the recorded output matches the DS Ridge score digit for
# digit (execution counter 103 vs. NA bucketed split 53), so it was likely
# produced from a stale DS split; re-run in order.
# NOTE(review): `normalize=` is no longer accepted by recent scikit-learn
# releases; confirm the pinned version before re-running.
ridge = Ridge(alpha=0.1, normalize=True).fit(X_train, y_train)
ridge.score(X_test, y_test)

0.07771027958486343

In [104]:
# Lasso regression (alpha=0.1) fitted on the training split; the cell
# displays its score on the held-out split.
# NOTE(review): the recorded output matches the DS Lasso score digit for
# digit (execution counter 104 vs. NA bucketed split 53), so it was likely
# produced from a stale DS split; re-run in order.
# NOTE(review): `normalize=` is no longer accepted by recent scikit-learn
# releases; confirm the pinned version before re-running.
lasso = Lasso(alpha=0.1, normalize=True).fit(X_train, y_train)
lasso.score(X_test, y_test)

-6.326378199084104e-08

## PCA and Correlation

In [70]:
# Reload the DT dataset (un-bucketed) and pull out the target and the three
# count features for the PCA/correlation check.
df = pd.read_pickle('Data/Processed Data/ML/DT-ml.pkl')

y = df['core_periphery']
X = df[['count', 'headline_count', 'text_count']]

In [71]:
# Reduce the three count features to their first principal component and
# flatten the result to 1-D so it can be correlated with the target.
# Note: X and y are rebound from pandas objects to NumPy arrays here, so this
# cell cannot be re-run without first re-running the load cell above.
pca = PCA(n_components=1)

X = np.ravel(pca.fit_transform(X))
# `Series.as_matrix()` was deprecated in pandas 0.23 and removed in 1.0;
# `.values` is the documented replacement and returns the same ndarray.
y = y.values

In [72]:
# Correlation-coefficient matrix between the first principal component (X)
# and the target (y).
corr = np.corrcoef(X, y)

In [73]:
# Display the 2x2 matrix; the off-diagonal entry is the X-y correlation.
corr

array([[1.        , 0.33404725],
       [0.33404725, 1.        ]])

In [74]:
# Remove `pagerank` and `degree_centrality` from `df` in place, then display
# the pairwise Pearson correlations of the remaining columns.
df.drop(labels=['pagerank', 'degree_centrality'], axis='columns', inplace=True)
df.corr(method='pearson')

Unnamed: 0,core_periphery,count,headline_count,text_count
core_periphery,1.0,0.280397,0.317273,0.347796
count,0.280397,1.0,0.275348,0.947973
headline_count,0.317273,0.275348,1.0,0.505956
text_count,0.347796,0.947973,0.505956,1.0


In [75]:
# Reload the DS dataset (un-bucketed) and pull out the target and the three
# count features for the PCA/correlation check.
df = pd.read_pickle('Data/Processed Data/ML/DS-ml.pkl')

y = df['core_periphery']
X = df[['count', 'headline_count', 'text_count']]

In [76]:
# Reduce the three count features to their first principal component and
# flatten the result to 1-D so it can be correlated with the target.
# Note: X and y are rebound from pandas objects to NumPy arrays here, so this
# cell cannot be re-run without first re-running the load cell above.
pca = PCA(n_components=1)

X = np.ravel(pca.fit_transform(X))
# `Series.as_matrix()` was deprecated in pandas 0.23 and removed in 1.0;
# `.values` is the documented replacement and returns the same ndarray.
y = y.values

In [77]:
# Correlation-coefficient matrix between the first principal component (X)
# and the target (y).
corr = np.corrcoef(X, y)

In [78]:
# Display the 2x2 matrix; the off-diagonal entry is the X-y correlation.
corr

array([[1.        , 0.34784551],
       [0.34784551, 1.        ]])

In [79]:
# Remove `pagerank` and `degree_centrality` from `df` in place, then display
# the pairwise Pearson correlations of the remaining columns.
df.drop(labels=['pagerank', 'degree_centrality'], axis='columns', inplace=True)
df.corr(method='pearson')

Unnamed: 0,core_periphery,count,headline_count,text_count
core_periphery,1.0,0.328125,0.249612,0.341398
count,0.328125,1.0,0.424225,0.837756
headline_count,0.249612,0.424225,1.0,0.75075
text_count,0.341398,0.837756,0.75075,1.0


In [65]:
# Reload the NA dataset (un-bucketed) and pull out the target and the three
# count features for the PCA/correlation check.
df = pd.read_pickle('Data/Processed Data/ML/NA-ml.pkl')

y = df['core_periphery']
X = df[['count', 'headline_count', 'text_count']]

In [66]:
# Reduce the three count features to their first principal component and
# flatten the result to 1-D so it can be correlated with the target.
# Note: X and y are rebound from pandas objects to NumPy arrays here, so this
# cell cannot be re-run without first re-running the load cell above.
pca = PCA(n_components=1)

X = np.ravel(pca.fit_transform(X))
# `Series.as_matrix()` was deprecated in pandas 0.23 and removed in 1.0;
# `.values` is the documented replacement and returns the same ndarray.
y = y.values

In [67]:
# Correlation-coefficient matrix between the first principal component (X)
# and the target (y).
corr = np.corrcoef(X, y)

In [68]:
# Display the 2x2 matrix; the off-diagonal entry is the X-y correlation.
corr

array([[1.        , 0.25593588],
       [0.25593588, 1.        ]])

In [69]:
# Remove `pagerank` and `degree_centrality` from `df` in place, then display
# the pairwise Pearson correlations of the remaining columns.
df.drop(labels=['pagerank', 'degree_centrality'], axis='columns', inplace=True)
df.corr(method='pearson')

Unnamed: 0,core_periphery,count,headline_count,text_count
core_periphery,1.0,0.255063,0.139722,0.249954
count,0.255063,1.0,0.251324,0.888336
headline_count,0.139722,0.251324,1.0,0.528823
text_count,0.249954,0.888336,0.528823,1.0
