In [1]:
import pandas as pd
import numpy as np
from sklearn import datasets, linear_model
from scipy import stats as ss
import matplotlib.pyplot as plt
import matplotlib.cm as cm

In [2]:
filepath = '/Users/Aman/Desktop/Colin/'
df = pd.read_csv(filepath + 'website_metrics.txt')

In [3]:
Y = df['median_page_download_time']
X = df[['number_of_servers_contacted',
        'number_of_object_requests_median',
        'object_request_size_median',
        'number_of_javascript_objects_median',
       'number_of_javascript_objects_median',
       'size_of_javascript_objects_median',
       'number_of_image_objects_median',
       'size_of_image_objects_median',
       'number_of_flash_objects_median',
       'size_of_flash_objects_median',
       'number_of_css_objects_median',
       'size_of_css_objects_median',
       ]]

# X = df[['number_of_object_requests_median']]

In [4]:
def normalize(Y,X):
    X = np.array(X)
    Y = np.array(Y)
    Y = np.array(ss.zscore(Y)).T
    X = np.array([(ss.zscore(x)) for x in X.T]).T
    return Y,X
    
def regression(Y,X):
    regr = linear_model.LinearRegression()
    Y,X = normalize(Y,X)
    regr.fit(X,Y)
    print('Coefficients: \n', regr.coef_)
    P = regr.predict(X)
    print("Residual sum of squares: %.2f"% np.mean((P - Y) ** 2))
    
    return regr,P

regr,Predicted = regression(Y,X)
# norm_Y = np.array(ss.zscore(Y)).T

Predicted = (Predicted*np.mean(df['median_page_download_time'])) + np.std(df['median_page_download_time'])

('Coefficients: \n', array([ 0.13167658,  0.79767041, -0.01362765,  0.05513603,  0.05513603,
        0.06343789, -0.33478202,  0.0886664 , -0.03954514, -0.00513715,
       -0.03324378,  0.06891919]))
Residual sum of squares: 0.36


In [5]:
f, ax = plt.subplots(figsize=(10, 10))
ax.scatter(Y, Predicted, s=60,c='g')
ax.set_ylim(ymin=-1000)
ax.set_xlim(xmin=-1000)
ax.set_ylim(ymax=15000)
ax.set_xlim(xmax=15000)
plt.xlabel("Predicted Page Load Time(in miliseconds)")
plt.ylabel("Actual Page Load Time(in miliseconds)")
plt.title("Page Load Time: Actual vs Predicated")
ax.plot(ax.get_xlim(), ax.get_ylim(), ls="--", c=".3")
plt.show()

In [15]:

number_of_servers_contacted = np.array(ss.zscore(df.number_of_servers_contacted))
number_of_object_requests_median = np.array(ss.zscore(df.number_of_object_requests_median))
object_request_size_median = np.array(ss.zscore(df.object_request_size_median))
norm_Y = np.array(ss.zscore(Y)).T

colors = iter(cm.rainbow(np.linspace(0, 1, 3)))
f1, ax1 = plt.subplots(figsize=(10, 10))
ax1.set_ylim(ymax=6)
ax1.set_xlim(xmax=6)
ax1.scatter(object_request_size_median,norm_Y , s=75,color=next(colors), marker="p",edgecolor = 'black', label='Median Length of Objects',alpha=0.9)
ax1.scatter(number_of_servers_contacted,norm_Y , s=75,color=next(colors), marker="o",edgecolor = 'black', label='Number Servers Contacted',alpha=0.9)
ax1.scatter(number_of_object_requests_median,norm_Y , s=75,color=next(colors), marker="s",edgecolor = 'black', label='Median Count of Object',alpha=0.9)


plt.legend(loc='upper left');
plt.colors()
plt.ylabel('Time in seconds')
plt.xlabel('Features(Normalized to make comparable)')
plt.grid(True)
plt.title("Page Load Time vs Features")
plt.show()