# Regression Analysis

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
# from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from statsmodels.formula.api import glm, ols
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn.metrics import r2_score
# from sklearn.cross_validation import KFold
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

In [None]:
# Creates the features
# Give pandas the path so it opens and closes the file itself. The previous
# open(..., 'rU') handle was never closed, and the 'U' mode flag was removed
# from open() in Python 3.11.
regress_data = pd.read_csv("all_features.csv", encoding='utf-8', engine='c')

# Drop the leftover index columns, the tweet average and the precomputed round
# flags (the flags are rebuilt below from `Series`). `columns=` is used because
# pandas 2.0 no longer accepts the axis as a positional argument.
regress_data = regress_data.drop(columns=[
    'Unnamed: 0', 'Unnamed: 0.1', 'Avg_Tweets',
    'is_Series_C', 'is_Series_D', 'is_Series_B', 'is_Series_A',
])

# Drop the first character of every round label so the values can be compared
# with 'Series_A' ... 'Series_D'.
# NOTE(review): presumably the CSV carries a one-character prefix — confirm.
regress_data['Series'] = regress_data['Series'].str[1:]

# One 0/1 indicator column per funding round.
for round_letter in 'ABCD':
    round_name = 'Series_' + round_letter
    regress_data['i_' + round_name] = (regress_data['Series'] == round_name).astype(int)

# Infinite and missing values are both replaced with 0.
regress_data = regress_data.replace([np.inf, -np.inf], 0).fillna(0)

# Break it into test and train
# A fixed seed makes the 90/10 split (and every number reported further down)
# reproducible across kernel restarts.
SPLIT_SEED = 42
n_rows = regress_data.shape[0]
itrain, itest = train_test_split(np.arange(n_rows), train_size=0.9, random_state=SPLIT_SEED)
# Boolean row mask: True for training rows, False for test rows. Selecting by
# mask keeps both subsets in their original row order.
mask = np.zeros(n_rows, dtype=bool)
mask[itrain] = True
regress_train = regress_data[mask]
regress_test = regress_data[~mask]

# Break it into series A, series B, series C, series D
regress_test_A = regress_test[regress_test['Series'] == 'Series_A']
regress_train_A = regress_train[regress_train['Series'] == 'Series_A']
regress_test_B = regress_test[regress_test['Series'] == 'Series_B']
regress_train_B = regress_train[regress_train['Series'] == 'Series_B']
regress_test_C = regress_test[regress_test['Series'] == 'Series_C']
regress_train_C = regress_train[regress_train['Series'] == 'Series_C']
regress_test_D = regress_test[regress_test['Series'] == 'Series_D']
regress_train_D = regress_train[regress_train['Series'] == 'Series_D']

regress_train.head()

In [None]:
# Create stats for each of the funding rounds
def find_stats(funding, label):
    """Print a short summary of one funding round.

    Writes four lines to stdout: a dashed separator, the round label, and the
    mean and standard deviation (``np.mean`` / ``np.std``) of the values
    stored under ``funding['Series_Amount']``.

    Parameters
    ----------
    funding : mapping or DataFrame
        Anything indexable by the key ``'Series_Amount'``.
    label : str
        Name of the round; it is concatenated onto the ``'Round: '`` prefix.

    Returns
    -------
    None
    """
    separator = '------------------------------------'
    print(separator)
    print(''.join(('Round: ', label)))
    series_amounts = funding['Series_Amount']
    # print() applies str() to each argument and joins them with one space,
    # which yields the same text as 'Mean ' + str(value).
    print('Mean', np.mean(series_amounts))
    print('StDev', np.std(series_amounts))
    
    
# Helps visualize the log of the funding rounds
def create_log_plot(regress_train_A, regress_train_B, regress_train_C, regress_train_D):
    """Draw a 2x2 grid of histograms of log funding amounts, one per round.

    Each argument must be indexable by ``'Series_Amount'``; the natural log of
    that column is histogrammed in its own panel, in the order A, B, C, D.
    Nothing is returned; the figure is created through ``plt.subplots``.
    """
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15,10))
    fig.tight_layout()
    # Pair each panel title with the frame it describes.
    panels = [
        ('Series A Funding Amounts', regress_train_A),
        ('Series B Funding Amounts', regress_train_B),
        ('Series C Funding Amounts', regress_train_C),
        ('Series D Funding Amounts', regress_train_D),
    ]
    for ax, (panel_title, round_frame) in zip(axes.ravel(), panels):
        log_amounts = np.log(list(round_frame['Series_Amount']))
        ax.hist(log_amounts, color='green', alpha=0.2)
        ax.set_title(panel_title)

# Visualizes the residuals of the funding rounds
def plot_residuals(errors):
    """Show a single histogram of ``errors`` titled 'Residual Analysis'.

    ``errors`` is passed straight to ``Axes.hist``; nothing is returned.
    """
    hist_style = {'color': 'green', 'alpha': 0.2}
    fig, residual_ax = plt.subplots(1, 1, figsize=(7,7))
    fig.tight_layout()
    residual_ax.hist(errors, **hist_style)
    residual_ax.set_title('Residual Analysis')

In [None]:
 # Histogram the log funding amounts of the four per-round training subsets.
 create_log_plot(regress_train_A, regress_train_B, regress_train_C, regress_train_D)

In [None]:
# Candidate predictors: every column from the third onward except the market
# label and the target itself. The per-round subsets share the full frame's
# columns, so the list is read from the training frame directly.
# NOTE(review): assumes the first two columns are non-numeric identifiers — confirm.
features = list(regress_train.columns[2:])
features.remove('Market')
features.remove('Series_Amount')

# Plain 2-D float ndarrays rather than np.matrix: scikit-learn >= 1.2 raises
# TypeError when it is handed an np.matrix.
X_train = regress_train[features].to_numpy(dtype=np.float64)
X_test = regress_test[features].to_numpy(dtype=np.float64)

# Targets are the natural log of the funding amount, kept as (n, 1) columns so
# the cells below see the same shape as before.
# NOTE(review): a Series_Amount of 0 would become -inf here — confirm none exist.
y_train = np.log(regress_train['Series_Amount'].to_numpy(dtype=np.float64)).reshape(-1, 1)
y_test = np.log(regress_test['Series_Amount'].to_numpy(dtype=np.float64)).reshape(-1, 1)

# Ridge Regression

In [None]:
# Choose the Ridge penalty by k-fold cross-validation on the training split.
vdict = {}  # alpha -> validation RMSE averaged over all folds
rdict = {}  # alpha -> Ridge model refit on the whole training split
kf = KFold()

# Work on plain ndarrays: scikit-learn >= 1.2 rejects np.matrix inputs.
X_cv = np.asarray(X_train)
y_cv = np.asarray(y_train).reshape(-1, 1)

for a in [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]:
    fold_rmse = []
    for cv_train, cv_test in kf.split(X_cv):
        fold_model = Ridge(alpha=a)
        fold_model.fit(X_cv[cv_train], y_cv[cv_train])
        fold_pred = np.asarray(fold_model.predict(X_cv[cv_test])).reshape(-1, 1)
        fold_rmse.append(mean_squared_error(y_cv[cv_test], fold_pred) ** 0.5)
    # Average over the folds. Previously each fold overwrote the entry, so only
    # the last fold's score (and a model trained on a subset) was kept.
    vdict[a] = float(np.mean(fold_rmse))
    # The target stays 2-D so predict() keeps returning an (n, 1) column.
    rdict[a] = Ridge(alpha=a).fit(X_cv, y_cv)

best_a = min(vdict, key=vdict.get)
best_ridge_model = rdict[best_a]

# The stored values are root-mean-squared errors, so label them as such.
print('Cross validation RMSE values: ' + str(vdict))
print('Best a from cross validation: ' + str(best_a))

In [None]:
# Score the selected Ridge model on the held-out test split (log-amount space).
# Everything is converted to plain ndarrays because scikit-learn >= 1.2 raises
# TypeError on np.matrix inputs.
y_true = np.asarray(y_test).reshape(-1, 1)
# Stored as a (1, n) row, as before, so a later `y_pred.T` still gives a column.
y_pred = np.asarray(best_ridge_model.predict(np.asarray(X_test))).reshape(1, -1)
mse = mean_squared_error(y_true, y_pred.T) ** 0.5  # despite the name, this is the RMSE

# Baseline: predict log(mean test-set amount) for every test row.
# NOTE(review): the mean is taken over the test split itself — confirm intended.
baseline_mean = np.full(y_true.shape, np.log(np.mean(regress_test['Series_Amount'])))

print('RMSE for Ridge Regression: ' + str(mse))
print('Baseline by predicting averages RMSE: ' + str(mean_squared_error(y_true, baseline_mean)**(0.5)))

In [None]:
# Flatten both sides to 1-D ndarrays. This works whether y_pred is stored as a
# row or a column, and avoids passing np.matrix to scikit-learn (TypeError in
# scikit-learn >= 1.2).
ridge_actual = np.asarray(y_test).ravel()
ridge_predicted = np.asarray(y_pred).ravel()
plot_residuals(ridge_actual - ridge_predicted)
print('R Squared Score: ' + str(r2_score(ridge_actual, ridge_predicted)))

# Lasso Regression

In [None]:
# Choose the Lasso penalty by k-fold cross-validation on the training split.
vdict = {}  # alpha -> validation RMSE averaged over all folds
rdict = {}  # alpha -> Lasso model refit on the whole training split
kf = KFold()

# Work on plain ndarrays: scikit-learn >= 1.2 rejects np.matrix inputs.
X_cv = np.asarray(X_train)
y_cv = np.asarray(y_train).reshape(-1, 1)

for a in [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]:
    fold_rmse = []
    for cv_train, cv_test in kf.split(X_cv):
        fold_model = Lasso(alpha=a)
        fold_model.fit(X_cv[cv_train], y_cv[cv_train])
        fold_pred = np.asarray(fold_model.predict(X_cv[cv_test])).reshape(-1, 1)
        fold_rmse.append(mean_squared_error(y_cv[cv_test], fold_pred) ** 0.5)
    # Average over the folds. Previously each fold overwrote the entry, so only
    # the last fold's score (and a model trained on a subset) was kept.
    vdict[a] = float(np.mean(fold_rmse))
    rdict[a] = Lasso(alpha=a).fit(X_cv, y_cv)

best_a = min(vdict, key=vdict.get)
best_lasso_model = rdict[best_a]

# The stored values are root-mean-squared errors, so label them as such.
print('Cross validation RMSE values: ' + str(vdict))
print('Best a from cross validation: ' + str(best_a))

In [None]:
# Score the selected Lasso model on the held-out test split (log-amount space).
# Everything is converted to plain ndarrays because scikit-learn >= 1.2 raises
# TypeError on np.matrix inputs.
y_true = np.asarray(y_test).reshape(-1, 1)
# Stored as an (n, 1) column, the same orientation this cell produced before.
y_pred = np.asarray(best_lasso_model.predict(np.asarray(X_test))).reshape(-1, 1)
mse = mean_squared_error(y_true, y_pred) ** 0.5  # despite the name, this is the RMSE

# Baseline: predict log(mean test-set amount) for every test row.
# NOTE(review): the mean is taken over the test split itself — confirm intended.
baseline_mean = np.full(y_true.shape, np.log(np.mean(regress_test['Series_Amount'])))

print('RMSE for Lasso Regression: ' + str(mse))
print('Baseline by predicting averages RMSE: ' + str(mean_squared_error(y_true, baseline_mean)**(0.5)))

In [None]:
# Flatten both sides to 1-D ndarrays so no np.matrix reaches scikit-learn
# (TypeError in scikit-learn >= 1.2) and the subtraction is element-wise.
lasso_actual = np.asarray(y_test).ravel()
lasso_predicted = np.asarray(y_pred).ravel()
plot_residuals(lasso_actual - lasso_predicted)
print('R Squared Score: ' + str(r2_score(lasso_actual, lasso_predicted)))