In [1]:
import pandas as pd
import numpy as np
from sklearn import datasets, linear_model
from scipy import stats as ss
import matplotlib.pyplot as plt
import matplotlib.cm as cm

In [2]:
filepath = '/Users/Aman/Desktop/Colin/'
df = pd.read_csv(filepath + 'website_metrics.txt')

In [3]:
Y = df['median_page_download_time']
X = df[['number_of_servers_contacted',
        'number_of_origins_contacted',
        'number_of_object_requests_median',
        'object_request_size_median',
        'number_of_javascript_objects_median',
        'number_of_javascript_objects_median',
        'size_of_javascript_objects_median',
        'number_of_image_objects_median',
        'size_of_image_objects_median',
        'number_of_flash_objects_median',
        'size_of_flash_objects_median',
        'number_of_css_objects_median',
        'size_of_css_objects_median',
        'number_of_unidentified_objects_median',
        'size_of_unidentified_objects_median'
       ]]



In [None]:
# 1. Basic Linear Regression.

In [5]:

def normalize(Y,X):
    X = np.array(X)
    Y = np.array(Y)
    Y = np.array(ss.zscore(Y)).T
    X = np.array([(ss.zscore(x)) for x in X.T]).T
    return Y,X

norm_Y, norm_X = normalize(Y,X)

def regression(Y,X):
    regr_model = linear_model.LinearRegression()
    regr_model.fit(X,Y)
    print('Coefficients: \n', regr_model.coef_)
    P = regr_model.predict(X)
    print("Residual sum of squares: %.2f"% np.mean((P - Y) ** 2))
    
    return regr_model,P

regr_model,Predicted = regression(norm_Y,norm_X)


Predicted = (Predicted*np.mean(df['median_page_download_time'])) + np.std(df['median_page_download_time'])

('Coefficients: \n', array([  1.68929833e-01,  -3.73717208e-02,   8.29434516e-01,
        -2.81168714e-02,   5.11458860e-02,   5.11458860e-02,
         6.54284587e-02,  -3.56110175e-01,   9.49090423e-02,
        -4.24582191e-02,   1.63842329e-04,  -3.43775893e-02,
         7.04444256e-02,  -9.43842873e-03,   9.10881623e-03]))
Residual sum of squares: 0.36


In [55]:
f, ax = plt.subplots(figsize=(10, 10))
ax.scatter(Y, Predicted, s=60,c='g')
ax.set_ylim(ymin=-1000)
ax.set_xlim(xmin=-1000)
ax.set_ylim(ymax=15000)
ax.set_xlim(xmax=15000)
plt.xlabel("Predicted Page Load Time(in miliseconds)")
plt.ylabel("Actual Page Load Time(in miliseconds)")
plt.title("Page Load Time: Actual vs Predicated")
ax.plot(ax.get_xlim(), ax.get_ylim(), ls="--", c=".3")
plt.show()

In [57]:

number_of_servers_contacted = np.array(ss.zscore(df.number_of_servers_contacted))
number_of_object_requests_median = np.array(ss.zscore(df.number_of_object_requests_median))
number_of_image_objects_median = np.array(ss.zscore(df.number_of_image_objects_median))

colors = iter(cm.rainbow(np.linspace(0, 1, 3)))
f1, ax1 = plt.subplots(figsize=(10, 10))
ax1.set_ylim(ymax=6)
ax1.set_xlim(xmax=6)
ax1.scatter(number_of_image_objects_median,norm_Y , s=75,color=next(colors), marker="p",edgecolor = 'black', label='Median Count of Only Images',alpha=0.9)
ax1.scatter(number_of_servers_contacted,norm_Y , s=75,color=next(colors), marker="o",edgecolor = 'black', label='Number Servers Contacted',alpha=0.9)
ax1.scatter(number_of_object_requests_median,norm_Y , s=75,color=next(colors), marker="s",edgecolor = 'black', label='Median Count of Object',alpha=0.9)


plt.legend(loc='upper left');
plt.colors()
plt.ylabel('Time in seconds')
plt.xlabel('Features(Normalized to make comparable)')
plt.grid(True)
plt.title("Page Load Time vs Features")
plt.show()