In [1]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer
from sklearn.preprocessing import RobustScaler

from sklearn.model_selection import train_test_split

from sklearn.metrics import r2_score, mean_squared_error

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor, ExtraTreesRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import string
import random
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore
import pickle
from flask import Flask
from sklearn.metrics import balanced_accuracy_score, f1_score
from scipy import stats
import plotly.express as px


In [2]:
demo_df = pd.read_csv(r"train.csv")

In [3]:
import os
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/content/automl-bigdata-7c2859c8477a.json' 

In [4]:
from google.cloud import storage
def write_read(bucket_name, blob_name):
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(blob_name)
    blob.upload_from_filename('/content/Dummy Data HSS.csv')

In [5]:
def impute(data):
    # Dashboard for raw data
    num_cols_raw = [x for x in data.columns if data[x].dtype in ['int64', 'float64']]
    numerical_cols_raw = len(num_cols_raw)
    cat_cols_raw = [x for x in data.columns if data[x].dtype in ['object']]
    categorical_columns_raw = len(cat_cols_raw)
    num_columns_raw = [x for x in data.columns if data[x].dtype]
    number_columns_raw = len(num_columns_raw)
    num_rows_raw = len(data.index)

    #checking missing values
    percent_missing = data.isnull().sum() * 100 / data.shape[0]
    #dropping columns if missing percentage is more than 30
    for i in range(len(data.columns)):
        if percent_missing[i] >30:
            data.drop(data.columns[i],axis=1,inplace=True)
    
    missing = [x for x in percent_missing if x > 0.0]
    missing_rows_raw = len(missing)

    rawdata_dash = {
        'Raw_numericalvalues': numerical_cols_raw,
        'Raw_categoricalvalues': categorical_columns_raw,
        'Raw_columns': number_columns_raw,
        'Raw_rows': num_rows_raw,
        'Raw_missing': missing_rows_raw
      } 

    #getting numerical and categorical variables
    numerical_columns = [x for x in data.columns if data[x].dtype != 'object']
    data_num = data[numerical_columns]
    
    cat_columns = [x for x in data.columns if x not in numerical_columns]
    data_cat = data[cat_columns]
    
    #Imputing using KNN Imputer for numerical columns
    imputer = KNNImputer(n_neighbors=2)
    imputed_num = imputer.fit_transform(data_num)
    imputed_num = pd.DataFrame(imputed_num)
    imputed_num.columns=data_num.columns
    
    # most frequent imputation for categorical columns
    data_cat_imputed = data_cat.apply(lambda x: x.fillna(x.value_counts().index[0]))
    
    #concat the imputed dfs
    imputed_data = pd.concat([imputed_num, data_cat_imputed], axis=1)

    # Dashboard for imputed data

    num_cols_imp = [x for x in imputed_data.columns if imputed_data[x].dtype in ['int64', 'float64']]
    numerical_cols_imp = len(num_cols_imp)
    cat_cols_imp = [x for x in imputed_data.columns if imputed_data[x].dtype in ['object']]
    categorical_columns_imp = len(cat_cols_imp)
    num_columns_imp = [x for x in imputed_data.columns if imputed_data[x].dtype]
    number_columns_imp = len(num_columns_imp)
    num_rows_imp = len(imputed_data.index)

    missing_imp = [x for x in percent_missing if x > 0.30]
    missing_rows_imp = len(missing_imp)
    
    impdata_dash = {
        'Imputed_numericalvalues': numerical_cols_imp,
        'Imputed_categoricalvalues': categorical_columns_imp,
        'Imputed_columns': number_columns_imp,
        'Imputed_rows': num_rows_imp,
        'Imputed_missingvalues': missing_rows_imp
      } 

    #return imputed_data
    return imputed_data,rawdata_dash, impdata_dash

In [6]:
ll = impute(demo_df)
ll

(            ID    hr  weathersit  temp   atemp   hum  windspeed  casual  \
 0       3094.0  20.0         2.0  0.52  0.5000  0.83     0.1343    22.0   
 1      10645.0  16.0         3.0  0.50  0.4848  0.94     0.2985    49.0   
 2       2114.0  23.0         1.0  0.32  0.3182  0.61     0.1642    20.0   
 3      15289.0   6.0         3.0  0.62  0.5455  0.94     0.0896     1.0   
 4       2273.0  14.0         2.0  0.36  0.3485  0.81     0.1343    94.0   
 ...        ...   ...         ...   ...     ...   ...        ...     ...   
 10943  11284.0   9.0         1.0  0.46  0.4545  0.88     0.0896    30.0   
 10944  11964.0  17.0         1.0  0.66  0.6212  0.34     0.1343   124.0   
 10945   5390.0  12.0         1.0  0.80  0.7273  0.43     0.2836    26.0   
 10946    860.0   7.0         1.0  0.24  0.1970  0.65     0.4179     3.0   
 10947  15795.0   8.0         2.0  0.52  0.5000  0.83     0.1642    33.0   
 
          cnt      dteday  
 0      152.0  2011-05-13  
 1      177.0  2012-03-24  
 2

In [7]:
def normalize_and_encode(imputed_data):
    #normalizing numerical columns using robustscalar
    numerical_columns  = [x for x in imputed_data.columns if imputed_data[x].dtype in ['int64', 'float64']]
    scalar = RobustScaler(quantile_range=(25,75))
    scaled = scalar.fit_transform(imputed_data[numerical_columns])
    scaled = pd.DataFrame(scaled)
    scaled.columns = imputed_data[numerical_columns].columns
    
    #dropping cat columns with more than 10 categories
    cat_cols = [x for x in imputed_data.columns if x not in numerical_columns]
    cat_cols_to_drop = []
    for col in cat_cols:
        if imputed_data[col].value_counts().count()>10:
            cat_cols_to_drop.append(col)
    data_for_enc = imputed_data.drop(numerical_columns,axis=1)
    data_for_enc.drop(cat_cols_to_drop,axis=1,inplace=True)

    #encoding categorical varialbles
    try:
        enc_data= pd.get_dummies(data_for_enc, columns=data_for_enc.columns)
        encoded_data = pd.concat([scaled, enc_data], axis=1)
    except:
      encoded_data = scaled.copy()

    return encoded_data

In [8]:
train = normalize_and_encode(ll[0])

In [9]:
classifiers = [
    XGBClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    LogisticRegression(),
    DecisionTreeClassifier()
    ]

In [10]:
reg_models = [
    KNeighborsRegressor(),
    LinearRegression(),
    GradientBoostingRegressor(),
    ExtraTreesRegressor(),
    RandomForestRegressor(),
    DecisionTreeRegressor(),
    Lasso(),
    Ridge()
]

In [11]:
# os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = r'/content/automl-bigdata-514a5baac622.json' 
def cloud_access(bucket_name, blob_name, pickle_file):
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(blob_name)
    blob.upload_from_filename(pickle_file)

In [12]:
def connection():
  cred = credentials.Certificate('/content/auto-ml-af39c-firebase-adminsdk-37cmd-35f3911f5e.json')
  try:
    app = firebase_admin.initialize_app(cred)
  except:
    app = firebase_admin.initialize_app(cred, name = str(random.random()))
  return firestore.client()

def regression(train_data, y, reg_models):
  db = connection()  
  y_class = train_data[[y]]
  
  X_train, X_val, y_train, y_val = train_test_split(train_data.drop(y, axis=1), y_class, test_size=0.2, random_state=100)
  
  res = {}
  
  KNeighborsRegressor_grid = {
      'n_neighbors':[2,5,10], 
      'weights': ['uniform', 'distance'], 
      'algorithm': ['auto','ball_tree','kd_tree','brute'],
      'leaf_size': [15,30,45],
      }

  GradientBoostingRegressor_grid = {
      'loss':['squared_error', 'absolute_error', 'huber', 'quantile'],
      'learning_rate':[0.1,0.5,0.8],
      'n_estimators':[10,50,100]
  }

  ExtraTreesRegressor_grid = {
      'n_estimators':[10,50,100],
      'criterion':['squared_error', 'absolute_error', 'friedman_mse', 'poisson']
  }

  RandomForestRegressor_grid = {
      'n_estimators':[10,50,100],
      'criterion':['squared_error', 'absolute_error', 'friedman_mse', 'poisson']
  }

  DecisionTreeRegressor_grid = {
      'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
      'splitter':['best','random']
  }

  LinearRegression_grid = {
    'fit_intercept': [True, False]
  }

  Lasso_grid = {
      'alpha': [0.1, 0.2, 0.5],
      'fit_intercept': [True, False]
  }
  Ridge_grid = {
       'alpha': [0.1, 0.2, 0.5],
      'fit_intercept': [True, False]
  }
  
 
  params = { 
      'KNeighborsRegressor': KNeighborsRegressor_grid,
      'GradientBoostingRegressor': GradientBoostingRegressor_grid,
      'ExtraTreesRegressor': ExtraTreesRegressor_grid,
      'RandomForestRegressor': RandomForestRegressor_grid,
      'DecisionTreeRegressor': DecisionTreeRegressor_grid,
      'LinearRegression': LinearRegression_grid, 
      'Lasso': Lasso_grid,
      'Ridge':Ridge_grid
    }

  clf = {}

  for reg in reg_models:
    name = reg.__class__.__name__  
    try:
      clf[name] = RandomizedSearchCV(reg, params[name], random_state=0)
    except:
      print(name)
      continue
    results = clf[name].fit(X_train, y_train)
    print(results.best_params_)
    r2 = round(r2_score(y_val, clf[name].predict(X_val)), 3)
    rmse = round(mean_squared_error(y_val, clf[name].predict(X_val)), 3)
    N = 16
 
    string_name = ''.join(random.choices(string.ascii_uppercase + string.ascii_lowercase + string.digits, k = N))

    while string_name in db.collection(u'models').stream():
        string_name = ''.join(random.choices(string.ascii_uppercase + string.ascii_lowercase + string.digits, k = N))

    print("{} trained with an RMSE of : {} and an accuracy of: {}".format(name, rmse, r2))
    
    res[name] = {
        'RMSE': rmse,
         'r2': r2,
         'params': results.best_params_
      }  

  rmse_list = []
  r2_list = []
  names = list(res.keys())
  for name in res:
    rmse_list.append(res[name]['RMSE'])
    r2_list.append(res[name]['r2'])

  if rmse_list.count(min(rmse_list)) > 1:
    best_model = names[r2_list.index(max(r2_list))]
  else:
    best_model = names[rmse_list.index(min(rmse_list))]

  print(best_model, clf[best_model].get_params())
  pickle.dump(clf[best_model], open('model.pkl', 'wb'))
  cloud_access('automl-bigdataarch', 'regression_models/model.pkl', 'model.pkl')
  db.collection(u'models').document(string_name).set(res)
  return best_model

In [18]:
def connection():
  cred = credentials.Certificate('/content/auto-ml-af39c-firebase-adminsdk-37cmd-35f3911f5e.json')
  try:
    app = firebase_admin.initialize_app(cred)
  except:
    app = firebase_admin.initialize_app(cred, name = str(random.random()))
  return firestore.client()

def classification(train_data, y, classifiers):
  db = connection()  
  y_class = train_data[[y]]
  X_train, X_val, y_train, y_val = train_test_split(train_data.drop(y, axis=1), y_class, test_size=0.2, random_state=100)

  res = {}
  
  XGBClassifier_grid = {
      'n_estimators': stats.randint(50, 100),
      'learning_rate': stats.uniform(0.01, 0.59),
      'subsample': stats.uniform(0.3, 0.6),
      'max_depth': [3, 4, 5],
      'colsample_bytree': stats.uniform(0.5, 0.4),
      'min_child_weight': [1, 2, 3, 4]
      }

  RandomForestClassifier_grid = {
      'n_estimators':[10,50,100],
      'criterion':['gini', 'entropy', 'log_loss']
  }

  GradientBoostingClassifier_grid = {
      'loss':['log_loss', 'deviance', 'exponential'],
      'learning_rate':[0.1,0.5]
        }

  LogisticRegression_grid = {
    'penalty': ['l1', 'l2'],
    'dual':[True, False],
    'fit_intercept':[True,False]
  }

  DecisionTreeClassifier_grid = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter':['best', 'random']
  }
  
  params = { 
      'XGBClassifier': XGBClassifier_grid,
      'RandomForestClassifier': RandomForestClassifier_grid,
      'GradientBoostingClassifier': GradientBoostingClassifier_grid,
      'LogisticRegression': LogisticRegression_grid,
      'DecisionTreeClassifier':DecisionTreeClassifier_grid
    }
    
  clf = {}
  
  for clf1 in classifiers:
    name = clf1.__class__.__name__
    try:
      clf[name] = RandomizedSearchCV(clf1, params[name], random_state=0)
    except:
        print(name)
        continue 

    results = clf[name].fit(X_train, y_train)
    print(results.best_params_)        
    acc = round(balanced_accuracy_score(y_val, clf[name].predict(X_val)), 3)
    f1 = round(f1_score(y_true=y_val, y_pred = clf[name].predict(X_val), average='weighted'), 3)

    N = 16
 
    string_name = ''.join(random.choices(string.ascii_uppercase + string.ascii_lowercase + string.digits, k = N))

    while string_name in db.collection(u'models').stream():
        string_name = ''.join(random.choices(string.ascii_uppercase + string.ascii_lowercase + string.digits, k = N))

    print("{} trained with an F1 of : {} and an accuracy of: {}".format(name, f1, acc))

    res[name] = {
        'Accuracy': acc,
         'F1Score': f1,
         'params': results.best_params_
      }  

  acc_list = []
  f1_list = []
  names = list(res.keys())
  for name in res:
    acc_list.append(res[name]['Accuracy'])
    f1_list.append(res[name]['F1Score'])

  if acc_list.count(max(acc_list)) > 1:
    best_model = names[f1_list.index(max(f1_list))]
  else:
    best_model = names[acc_list.index(max(acc_list))]

  print(best_model, clf[best_model].get_params())
  pickle.dump(clf[best_model], open('model.pkl', 'wb'))
  cloud_access('automl-bigdataarch', 'classification_models/model.pkl', 'model.pkl')
  db.collection(u'models').document(string_name).set(res)
  return best_model

In [14]:
import warnings
warnings.filterwarnings("ignore")

In [17]:
regression(train,'cnt',reg_models)

{'weights': 'distance', 'n_neighbors': 10, 'leaf_size': 15, 'algorithm': 'brute'}
KNeighborsRegressor trained with an RMSE of : 0.164 and an accuracy of: 0.693
{'fit_intercept': True}
LinearRegression trained with an RMSE of : 0.244 and an accuracy of: 0.541
{'n_estimators': 100, 'loss': 'huber', 'learning_rate': 0.5}
GradientBoostingRegressor trained with an RMSE of : 0.078 and an accuracy of: 0.853
{'n_estimators': 100, 'criterion': 'squared_error'}
ExtraTreesRegressor trained with an RMSE of : 0.08 and an accuracy of: 0.849
{'n_estimators': 100, 'criterion': 'friedman_mse'}
RandomForestRegressor trained with an RMSE of : 0.081 and an accuracy of: 0.848
{'splitter': 'best', 'criterion': 'squared_error'}
DecisionTreeRegressor trained with an RMSE of : 0.156 and an accuracy of: 0.707
{'fit_intercept': False, 'alpha': 0.1}
Lasso trained with an RMSE of : 0.293 and an accuracy of: 0.45
{'fit_intercept': True, 'alpha': 0.5}
Ridge trained with an RMSE of : 0.244 and an accuracy of: 0.541
G

'GradientBoostingRegressor'

In [None]:
classification(train,'Survived',classifiers)