# Flatiron School Mod_7 project - Modelling Notebook

In [179]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from glob import glob
import notebook_toolkit as nt #external scripts for processing data



In [180]:
# Load every PageSpeed results CSV and combine them into one frame.
# glob() returns paths in arbitrary, OS-dependent order; sort them so that the
# "keep the first row per address" de-duplication below is reproducible.
filenames = sorted(glob('pagespeed_csvs/pagespeed_results_*.csv'))
if not filenames:
    # Fail with a clear message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError(
        "No files match 'pagespeed_csvs/pagespeed_results_*.csv'")
dataframes = [pd.read_csv(f) for f in filenames]
# ignore_index=True gives every row a unique label instead of repeating each
# file's own 0..n-1 labels in the combined frame.
pagespeed_df = pd.concat(dataframes, ignore_index=True)
# Columns are renamed purely by position; this assumes every CSV carries these
# eleven columns in exactly this order.
pagespeed_df.columns = ['address',
                        'first_contentful_paint',
                        'time_to_interactive',
                        'time_to_first_byte',
                        'dom_size',
                        'boot_up_time',
                        'first_meaningful_paint',
                        'speed_index',
                        'total_blocking_time',
                        'network_requests',
                        'total_byte_weight']
# Keep a single row per URL (the first one encountered); rebinding the name
# avoids in-place mutation.
pagespeed_df = pagespeed_df.drop_duplicates(subset='address', keep='first')

In [181]:
# TODO: complete the pipeline - remove the duplicate values and make sure we have a list of X and y variables to process before putting the data into the pipeline

Unnamed: 0,address,first_contentful_paint,time_to_interactive,time_to_first_byte,dom_size,boot_up_time,first_meaningful_paint,speed_index,total_blocking_time,network_requests,total_byte_weight
0,https://www.asdafoundation.org/,1275,1410,609.853,137,8.471,1410,2129.675696,0.0,43.0,309579.0
1,https://www.americanstampdealer.com/,908,908,883.182,676,36.641,908,1870.638039,0.0,29.0,608118.0
2,https://www.asdagoodliving.co.uk/,1035,1193.5,768.089,1339,436.294,1165,2184.859293,18.5,54.0,1248103.0
3,http://www.asdatoday.com/,590,2785,358.059,326,282.68,656,1180.783589,25.5,45.0,5422097.0
4,https://downdetector.co.uk/status/asda/,708,4630,385.969,2195,1796.4,748,1771.028798,60.0,326.0,2833017.0
...,...,...,...,...,...,...,...,...,...,...,...
11080,https://www.crunchbase.com/person/stephen-curry,580,3963.5,590.61,1921,2567.22,1612,1637.243066,973.5,85.0,1832888.0
11082,https://www.sfchronicle.com/warriors/article/S...,1219,10640,238.884,1184,3168.11,1219,4341.616309,208.0,575.0,5890014.0
11085,https://www.cbssports.com/nba/players/playerpa...,953,9365.5,1590.51,2298,4762.25,953,4540.687065,625.5,457.0,6759435.0
11087,https://www.khou.com/article/news/health/coron...,712,15833.5,402.752,1077,9590.34,1077,6676.864039,667.5,1093.0,7843296.0


In [172]:
class CleanPageSpeedData(BaseEstimator, TransformerMixin):
    """Stateless transformer that prepares raw PageSpeed rows for modelling.

    ``transform`` drops every row containing a missing value, removes the
    ``address`` (URL) column and casts all remaining columns to ``float64``.
    It works on the frame it is given (not on a notebook-level global) and
    returns a new frame, so the input is left untouched and the transformer
    can be applied repeatedly or inside a ``Pipeline``.
    """

    def __init__(self):
        # No hyper-parameters to store.
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` for chaining."""
        return self

    def transform(self, X):
        """Return a cleaned, all-``float64`` copy of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Raw PageSpeed data with an ``address`` column plus metric columns.

        Returns
        -------
        pandas.DataFrame
            The rows of ``X`` without missing values, without ``address``,
            every remaining column cast to ``numpy.float64``.

        Raises
        ------
        KeyError
            If ``X`` has no ``address`` column.
        ValueError
            If a remaining column cannot be converted to float.
        """
        # dropna()/drop() return new frames, so the caller's data is not mutated.
        cleaned = X.dropna().drop(columns="address")
        # Cast every metric column, rather than a hard-coded positional range
        # that would silently skip the last one.
        return cleaned.astype(np.float64)

In [160]:
# Apply the cleaning transformer directly (outside the pipeline) to the raw frame.
cleaner = CleanPageSpeedData()
cleaned_data = cleaner.transform(pagespeed_df)
# NOTE(review): transform() returns a DataFrame whose columns carry string
# labels, so the integer keys 0 and 1 below look up columns literally named
# 0 and 1 and are expected to raise KeyError. Confirm which column is the
# target (y) and which columns are the features (X), then select them by name.
y = cleaned_data[0]
X = cleaned_data[1]


In [167]:
# Two-stage preprocessing pipeline: clean the raw page-speed frame, then
# standardize the resulting features.
clean_pipeline = Pipeline(
    steps=[
        ('clean_data', CleanPageSpeedData()),
        ('std_scaler', StandardScaler()),
    ]
)

In [168]:
# Fit the pipeline on the raw page-speed frame and keep the cleaned, scaled output.
clean_data = clean_pipeline.fit_transform(pagespeed_df)

In [169]:
# Display the transformed feature array produced by the pipeline.
clean_data

array([[ 6.02183101e-01, -7.80336140e-01, -1.70842395e-01, ...,
        -5.05350221e-01, -7.16671926e-01, -5.32641069e-01],
       [-8.72694839e-02, -9.36252528e-01,  1.61710517e-01, ...,
        -5.05350221e-01, -7.82997758e-01, -4.78259006e-01],
       [ 1.51314926e-01, -8.47578965e-01,  2.16795938e-02, ...,
        -4.78360256e-01, -6.64558773e-01, -3.61678912e-01],
       ...,
       [-2.73170099e-03,  1.69056591e+00,  1.02230094e+00, ...,
         4.07202355e-01,  1.24467767e+00,  6.42268995e-01],
       [-4.55478494e-01,  3.69946472e+00, -4.22817270e-01, ...,
         4.68476869e-01,  4.25776545e+00,  8.39705837e-01],
       [-1.88714823e-01,  2.58899392e-01,  1.40549070e+00, ...,
         5.86649145e-01, -4.98744194e-01,  5.09454172e-02]])