In [2]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer
from sklearn.preprocessing import RobustScaler

from sklearn.model_selection import train_test_split

from sklearn.metrics import r2_score, mean_squared_error

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor, ExtraTreesRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV
import string
import random
# import firebase_admin
# from firebase_admin import credentials
# from firebase_admin import firestore
# import pickle


In [3]:
demo_df = pd.read_csv(r"Dummy Data HSS.csv")

In [5]:
from google.cloud import storage
import os

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'C:\sujan\git learning\git\bigdata\maximal-record-384001-406302dda581.json' 
def write_read(bucket_name, blob_name):
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(blob_name)
    blob.upload_from_filename('upload_imputed.csv')

In [6]:
def impute(data):
    #checking missing values
    percent_missing = data.isnull().sum() * 100 / data.shape[0]
    #dropping columns if missing percentage is more than 30
    for i in range(len(data.columns)):
        if percent_missing[i] >30:
            data.drop(data.columns[i],axis=1,inplace=True)
    #getting numerical and categorical variables
    numerical_columns = [x for x in data.columns if data[x].dtype != 'object']
    data_num = data[numerical_columns]
    
    cat_columns = [x for x in data.columns if x not in numerical_columns]
    data_cat = data[cat_columns]
    
    #Imputing using KNN Imputer for numerical columns
    imputer = KNNImputer(n_neighbors=2)
    imputed_num = imputer.fit_transform(data_num)
    imputed_num = pd.DataFrame(imputed_num)
    imputed_num.columns=data_num.columns
    
    # most frequent imputation for categorical columns
    data_cat_imputed = data_cat.apply(lambda x: x.fillna(x.value_counts().index[0]))
    
    #concat the imputed dfs
    imputed_data = pd.concat([imputed_num, data_cat_imputed], axis=1)
    imputed_data.to_csv('upload_imputed.csv',index = False)
    write_read('automl-bigdata', 'remove.csv')
    #return imputed_data
    return imputed_data

In [5]:
ll = impute(demo_df)

In [6]:
def normalize_and_encode(imputed_data):
    #normalizing numerical columns using robustscalar
    numerical_columns  = [x for x in imputed_data.columns if imputed_data[x].dtype in ['int64', 'float64']]
    scalar = RobustScaler(quantile_range=(25,75))
    scaled = scalar.fit_transform(imputed_data[numerical_columns])
    scaled = pd.DataFrame(scaled)
    scaled.columns = imputed_data[numerical_columns].columns
    
    #dropping cat columns with more than 10 categories
    cat_cols = [x for x in imputed_data.columns if x not in numerical_columns]
    cat_cols_to_drop = []
    for col in cat_cols:
        if imputed_data[col].value_counts().count()>10:
            cat_cols_to_drop.append(col)
    data_for_enc = imputed_data.drop(numerical_columns,axis=1)
    data_for_enc.drop(cat_cols_to_drop,axis=1,inplace=True)

    #encoding categorical varialbles
    enc_data= pd.get_dummies(data_for_enc, columns=data_for_enc.columns)
    
    encoded_data = pd.concat([scaled, enc_data], axis=1)

    return encoded_data

In [7]:
train = normalize_and_encode(ll)

In [40]:
reg_models = [
    KNeighborsRegressor(),
    GradientBoostingRegressor(),
    ExtraTreesRegressor(),
    RandomForestRegressor(),
    DecisionTreeRegressor(),
    LinearRegression(),
    Lasso(),
    Ridge()
]

In [71]:
from firebase_admin import firestore
cred = credentials.Certificate('/content/auto-ml-af39c-firebase-adminsdk-37cmd-35f3911f5e.json')
app = firebase_admin.initialize_app(cred)
dummy = firestore.client()

<google.cloud.firestore_v1.client.Client at 0x7f5c2175dd90>

In [74]:
def connection():
  cred = credentials.Certificate('/content/auto-ml-af39c-firebase-adminsdk-37cmd-35f3911f5e.json')
  try:
    app = firebase_admin.initialize_app(cred)
  except:
    app = firebase_admin.initialize_app(cred, name = str(random.random()))
  return firestore.client()

def training(train_data, y, reg_models):
  db = connection()  
  y_class = train_data[[y]]
  
  X_train, X_val, y_train, y_val = train_test_split(train_data.drop(y, axis=1), y_class, test_size=0.2, random_state=100)
  
  res = {}
  
  KNeighborsRegressor_grid = {
      'n_neighbors':[2,5,10], 
      'weights': ['uniform', 'distance'], 
      'algorithm': ['auto','ball_tree','kd_tree','brute'],
      'leaf_size': [15,30,45],
      }

  GradientBoostingRegressor_grid = {
      'loss':['squared_error', 'absolute_error', 'huber', 'quantile'],
      'learning_rate':[0.1,0.5,0.8],
      'n_estimators':[10,50,100]
  }

  ExtraTreesRegressor_grid = {
      'n_estimators':[10,50,100],
      'criterion':['squared_error', 'absolute_error', 'friedman_mse', 'poisson']
  }

  RandomForestRegressor_grid = {
      'n_estimators':[10,50,100],
      'criterion':['squared_error', 'absolute_error', 'friedman_mse', 'poisson']
  }

  DecisionTreeRegressor_grid = {
      'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
      'splitter':['best','random']
  }

  LinearRegression_grid = {
    'fit_intercept': [True, False]
  }

  Lasso_grid = {
      'alpha': [0.1, 0.2, 0.5],
      'fit_intercept': [True, False]
  }
  Ridge_grid = {
       'alpha': [0.1, 0.2, 0.5],
      'fit_intercept': [True, False]
  }
  
 
  params = { 
      'KNeighborsRegressor': KNeighborsRegressor_grid,
      'GradientBoostingRegressor': GradientBoostingRegressor_grid,
      'ExtraTreesRegressor': ExtraTreesRegressor_grid,
      'RandomForestRegressor': RandomForestRegressor_grid,
      'DecisionTreeRegressor': DecisionTreeRegressor_grid,
      'LinearRegression': LinearRegression_grid, 
      'Lasso': Lasso_grid,
      'Ridge':Ridge_grid
    }

  for reg in reg_models:
    name = reg.__class__.__name__  
    try:
      clf = RandomizedSearchCV(reg, params[name], random_state=0)
    except:
      print(name)
      continue
    results = clf.fit(X_train, y_train)
    print(results.best_params_)
    r2 = round(r2_score(y_val, clf.predict(X_val)), 3)
    rmse = round(mean_squared_error(y_val, clf.predict(X_val)), 3)
    N = 16
 
    string_name = ''.join(random.choices(string.ascii_uppercase + string.ascii_lowercase + string.digits, k = N))

    while string_name in db.collection(u'models').stream():
        string_name = ''.join(random.choices(string.ascii_uppercase + string.ascii_lowercase + string.digits, k = N))

    print("{} trained with an RMSE of : {} and an accuracy of: {}".format(name, rmse, r2))
    
    res[name] = {
        'RMSE': rmse,
         'r2': r2,
         'params': results.best_params_
      }  

  # Add a new doc in collection 'cities' with ID 'LA'
  db.collection(u'models').document(string_name).set(res)
  return res

In [75]:
training(train, 'Sales', reg_models=reg_models)

{'weights': 'distance', 'n_neighbors': 5, 'leaf_size': 30, 'algorithm': 'ball_tree'}
KNeighborsRegressor trained with an RMSE of : 0.003 and an accuracy of: 0.99
{'n_estimators': 100, 'loss': 'squared_error', 'learning_rate': 0.1}
GradientBoostingRegressor trained with an RMSE of : 0.0 and an accuracy of: 0.999
{'n_estimators': 50, 'criterion': 'absolute_error'}
ExtraTreesRegressor trained with an RMSE of : 0.0 and an accuracy of: 0.999
{'n_estimators': 50, 'criterion': 'absolute_error'}
RandomForestRegressor trained with an RMSE of : 0.0 and an accuracy of: 0.999
{'splitter': 'random', 'criterion': 'absolute_error'}
DecisionTreeRegressor trained with an RMSE of : 0.001 and an accuracy of: 0.998
{'fit_intercept': True}
LinearRegression trained with an RMSE of : 0.0 and an accuracy of: 0.999
{'fit_intercept': False, 'alpha': 0.1}
Lasso trained with an RMSE of : 0.029 and an accuracy of: 0.912
{'fit_intercept': False, 'alpha': 0.1}
Ridge trained with an RMSE of : 0.0 and an accuracy of: 

{'KNeighborsRegressor': {'RMSE': 0.003,
  'r2': 0.99,
  'params': {'weights': 'distance',
   'n_neighbors': 5,
   'leaf_size': 30,
   'algorithm': 'ball_tree'}},
 'GradientBoostingRegressor': {'RMSE': 0.0,
  'r2': 0.999,
  'params': {'n_estimators': 100,
   'loss': 'squared_error',
   'learning_rate': 0.1}},
 'ExtraTreesRegressor': {'RMSE': 0.0,
  'r2': 0.999,
  'params': {'n_estimators': 50, 'criterion': 'absolute_error'}},
 'RandomForestRegressor': {'RMSE': 0.0,
  'r2': 0.999,
  'params': {'n_estimators': 50, 'criterion': 'absolute_error'}},
 'DecisionTreeRegressor': {'RMSE': 0.001,
  'r2': 0.998,
  'params': {'splitter': 'random', 'criterion': 'absolute_error'}},
 'LinearRegression': {'RMSE': 0.0,
  'r2': 0.999,
  'params': {'fit_intercept': True}},
 'Lasso': {'RMSE': 0.029,
  'r2': 0.912,
  'params': {'fit_intercept': False, 'alpha': 0.1}},
 'Ridge': {'RMSE': 0.0,
  'r2': 0.999,
  'params': {'fit_intercept': False, 'alpha': 0.1}}}