In [None]:
# Mount Google Drive inside the Colab runtime at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Switch the working directory to the project folder on the mounted Drive.
# NOTE(review): the recorded output reports "[Errno 2] No such file or directory"
# for this path and the kernel stayed in /content - confirm the folder name.
%cd '/content/gdrive/MyDrive/ML_automation/'

[Errno 2] No such file or directory: '/content/gdrive/MyDrive/ML_automation/'
/content


In [3]:
import pandas as pd
from sklearn import datasets

# Wrap the feature matrix of scikit-learn's bundled iris dataset in a DataFrame
# (columns are left with their default integer labels).
iris = datasets.load_iris()
df = pd.DataFrame(iris.data)

In [4]:
# Display the iris feature frame (pandas truncates the middle rows).
# `df.to` was an unfinished attribute access that raises AttributeError on a fresh run.
df

Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [None]:
import sqlite3
import os
import pickle
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
import numpy
warnings.filterwarnings('ignore')
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from mlxtend.preprocessing import minmax_scaling
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn import neighbors
import math
import xgboost as xg
from sklearn.svm import SVR

# 1. LOADING

# 1.1
def read_data_from_db(sqlite3_query,cur_execute_query,table_query):
  '''
    Read data from a SQLite database and return it as a DataFrame.

    Input Parameters: 
                    a. sqlite3_query: database location passed to sqlite3.connect
                    b. cur_execute_query: SQL statement run on a cursor; the rows it
                       yields are printed
                    c. table_query: SQL query whose result set becomes the DataFrame
                
    Returns: DataFrame
  '''
  con = sqlite3.connect(sqlite3_query)
  try:
    cur = con.cursor()
    try:
      cur.execute(cur_execute_query)
      print(cur.fetchall())
    finally:
      cur.close()
    return pd.read_sql_query(table_query, con)
  finally:
    # Always release the connection, even when one of the queries raises.
    con.close()

# 1.2

def read_data(csv_file_path):
  '''
    Load a CSV file into a DataFrame.

    Input Parameters: csv_file_path - path of the .csv file to read
                
    Returns: DataFrame parsed by pandas.read_csv with default options
  '''
  return pd.read_csv(csv_file_path)

# 2. PREPROCESSING

# 2.1

def data_type_correction(df):
  ''' 
    Casts every non-numeric column to string; numeric columns are left as they are.

    The frame passed in is modified and also returned.
  
    Input Parameters: DataFrame
                
    Returns: DataFrame
  '''
  # 'number' covers every numeric dtype (int8..int64, uint*, float16..float64),
  # so e.g. int32/float32 columns are no longer mistaken for categorical ones.
  num_values=df.select_dtypes(include='number').columns
  cat_values=[i for i in df.columns if i not in num_values]
  if cat_values:
    df[cat_values]=df[cat_values].astype('str')
  return df

# 2.2
def auto_remove_unwanted_columns(df):
  ''' 
    Drops non-numeric columns that carry no signal for model training: columns
    holding a single distinct value (constants) and columns where every row has
    its own distinct value (identifier-like). Numeric columns are never dropped.
  
    Input Parameters: DataFrame
                
    Returns: DataFrame
  '''
  num_values=df.select_dtypes(include='number').columns
  cat_values=[i for i in df.columns if i not in num_values]
  n_rows=len(df)
  # nunique() is the number of distinct non-missing values; int(value_counts())
  # only worked for single-valued columns and then yielded a frequency instead.
  to_drop=[i for i in cat_values if df[i].nunique() in (1, n_rows)]
  if not to_drop:
    return df
  return df.drop(to_drop,axis=1)

# 2.3
def auto_imputer(df):
  ''' 
    Imputation of empty entries by central tendency: mean for numeric columns,
    mode (most frequent value) for all other columns.

    The frame passed in is modified and also returned. Columns without missing
    values, and non-numeric columns that contain no value at all, are left alone.
  
    Input Parameters: DataFrame
                
    Returns: DataFrame
  '''
  num_values=df.select_dtypes(['int16', 'int32', 'int64', 'float16', 'float32', 'float64']).columns
  for column in df.columns:
    if not df[column].isna().any():
      continue
    if column in num_values:
      fill_value=df[column].mean()
    else:
      modes=df[column].mode()
      if modes.empty:
        continue
      # mode() returns a Series; passing it whole to fillna aligns on the index
      # and leaves most gaps unfilled, so pick the first (most frequent) value.
      fill_value=modes.iloc[0]
    # Assign back instead of fillna(inplace=True) on a column selection, which
    # is chained assignment and may not update the frame.
    df[column]=df[column].fillna(fill_value)
  return df

# 2.4
def remove_correlated_columns(df,threshold):
  ''' 
    Auto removal of highly correlated columns based on a user threshold.

    For every pair of numeric/boolean columns whose absolute correlation exceeds
    the threshold, the column that appears later in the frame is dropped.
    Other columns are ignored. The frame passed in is modified and also returned.

    Input Parameters: 
                    a. DataFrame
                    b. Threshold: numeric attribute
                    
    Returns: DataFrame
  '''
  # Correlation is only defined for numeric data; restricting the columns first
  # keeps corr() from failing on string columns.
  corr_matrix = df.select_dtypes(include=['number','bool']).corr().abs()
  # Keep the strict upper triangle so each pair is inspected once
  # (np.bool no longer exists in NumPy; the builtin bool is used instead).
  upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
  upper = corr_matrix.where(upper_mask)
  # Find features with correlation greater than threshold
  to_drop = [column for column in upper.columns if (upper[column] > threshold).any()]
  # Drop features 
  df.drop(to_drop, axis=1, inplace=True)
  return df

# 2.5
def target_label_encoder(df,target_variable,encoding_type): 
  ''' 
    Encodes the categorical (non-numeric) feature columns as numbers. The target
    column is set aside untouched and re-attached as the last column.

    Input Parameters: 
                     a. DataFrame
                     b. Target Variable: 'target_variable'
                     c. Encoding Type : 'label' , 'onehot' 
                
    Returns: DataFrame (a new frame; the input is not modified)

    Raises: ValueError if encoding_type is neither 'label' nor 'onehot'
  '''
  if encoding_type not in ('label','onehot'):
    # Previously an unknown type silently returned the data unencoded.
    raise ValueError("encoding_type must be 'label' or 'onehot', got %r" % (encoding_type,))
  target=df[target_variable]
  df=df.drop(target_variable,axis=1)
  # 'number' covers all numeric dtypes so int32/float32 features are not encoded.
  num_values=df.select_dtypes(include='number').columns 
  cat_values=[i for i in df.columns if i not in num_values]
  for column in cat_values:
    if encoding_type=='label':
      # A fresh encoder per column: each column gets its own 0..k-1 code space.
      df[column]=preprocessing.LabelEncoder().fit_transform(list(df[column]))
    else:
      dummies = pd.get_dummies(df[column], prefix=column, drop_first=False)
      df = pd.concat([df.drop([column],axis=1), dummies], axis=1)
  df[target_variable]=target
  return df

# 2.6
def standard_scale(df,target_variable):
  ''' 
    Normalization of numeric type columns based on standard scaler method
    (sklearn StandardScaler). The target column is never scaled.

    The frame passed in is modified and also returned. When no numeric feature
    column exists the frame is returned unchanged.

    Input Parameters: 
                    a. DataFrame
                    b. Target Variable: 'target_variable'
                    
    Returns: DataFrame
  '''
  num_values=df.select_dtypes(include='number').columns
  # The target may be non-numeric (and thus absent here); ignore that case
  # explicitly instead of hiding every possible error behind a bare except.
  num_values=num_values.drop(target_variable, errors='ignore')
  if len(num_values) == 0:
    return df
  df[num_values]=StandardScaler().fit_transform(df[num_values])
  return df

#2.7
def minmax_scale(df,target_variable):
  ''' 
    Normalization of numeric type columns based on min-max scaler method
    (mlxtend minmax_scaling). The target column is never scaled.

    The frame passed in is modified and also returned. When no numeric feature
    column exists the frame is returned unchanged.

    Input Parameters: 
                    a. DataFrame
                    b. Target Variable: 'target_variable'
                    
    Returns: DataFrame
  '''
  num_values=df.select_dtypes(include='number').columns
  # The target may be non-numeric (and thus absent here); ignore that case
  # explicitly instead of hiding every possible error behind a bare except.
  num_values=num_values.drop(target_variable, errors='ignore')
  if len(num_values) == 0:
    return df
  # Hand over only the numeric feature columns so that text columns elsewhere
  # in the frame cannot interfere with the scaler.
  scaled_data1 = minmax_scaling(df[num_values],columns=list(num_values))
  for column in num_values:
    df[column] = scaled_data1[column]
  return df

#3. FUNCTION USED IN MODELLING 
# NEEDED 
def train_and_test_split(df,target_variable,test_size=0.2,random_state=1):
  ''' 
    Splitting Data to two sets: Train and Test

    Input Parameters: 
                     a. DataFrame
                     b. Target Variable: 'target_variable'
                     c. test_size: share of rows held out for testing (default 0.2)
                     d. random_state: seed making the shuffle reproducible (default 1)
                    
    Returns: train dataset, test dataset, train target, test target
  '''
  # Every column except the target is used as a feature.
  columns = list(df.columns)
  columns.remove(target_variable)
  X = df[columns]
  Y = df[target_variable]
  X_train, X_test, y_train, y_test = train_test_split(
      X, Y, test_size=test_size, random_state=random_state)
  return X_train, X_test, y_train, y_test


#4. CLASSIFICATION MODELLING

# 4.1

# def logistic_regression_model(df,target_variable,output_file_name):
#   ''' 
#     Create and train machine learning model for prediction of independent variable
#     based on dependent variables in the dataframe. 
#     Input Parameters: 
#                     a. DataFrame
#                     b. Target Variable: 'target_variable'
#                     c. Output File Name: 'output_file_name' or None (If don't want to save)
                    
#     Returns: 
#            Display Results:
#                    a. Confusion Matrix
#                    b. Classification Report
#                    c. Accuracy
#   '''
#   X_train, X_test, y_train, y_test=train_and_test_split(df,target_variable)
#   if len(set(y_train)) == 2:
#     model = LogisticRegression(solver='liblinear', random_state=0)
#   else:
#     model = LogisticRegression(multi_class = 'multinomial', random_state=0)
#   model=model.fit(X_train,y_train)
#   test_pred=model.predict(X_test)
#   confusion_matrix_result=confusion_matrix(y_test,test_pred)
#   classification_report_result=classification_report(y_test,test_pred)
#   accuracy_score= metrics.accuracy_score(y_test,test_pred)* 100 
#   if output_file_name != None:
#     pickle.dump(model, open(output_file_name + '.sav', 'wb'))
#     print('Model Weights:', output_file_name + '.sav', ', got saved to location:',os.getcwd() + '/' + output_file_name + '.sav')

#   return confusion_matrix_result,classification_report_result,accuracy_score

# 4.2
def Support_Vector_classifier(df,target_variable,output_file_name):
  ''' 
    Train a support vector classifier (svm.SVC with default settings) that
    predicts the target (dependent) variable from the remaining (independent)
    columns, and evaluate it on the test part of train_and_test_split.

    Input Parameters: 
                    a. DataFrame
                    b. Target Variable: 'target_variable'
                    c. Output File Name: 'output_file_name' or None (If don't want to save)
                       The fitted model is pickled to output_file_name + '.sav'.

    Returns: tuple of
                   a. Confusion Matrix
                   b. Classification Report
                   c. Accuracy (in percent)
  '''
  X_train, X_test, y_train, y_test=train_and_test_split(df,target_variable)
  model = svm.SVC()
  model=model.fit(X_train,y_train)
  test_pred=model.predict(X_test)
  confusion_matrix_result=confusion_matrix(y_test,test_pred)
  classification_report_result=classification_report(y_test,test_pred)
  accuracy_score= metrics.accuracy_score(y_test,test_pred)* 100 
  if output_file_name is not None:
    # The context manager closes (and flushes) the file even if pickling fails.
    with open(output_file_name + '.sav', 'wb') as model_file:
      pickle.dump(model, model_file)
    print('Model Weights:', output_file_name + '.sav', ', got saved to location:',os.getcwd() + '/' + output_file_name + '.sav')

  return confusion_matrix_result,classification_report_result,accuracy_score


# 4.3
def Decision_Tree_Classifier(df,target_variable,output_file_name):
  ''' 
    Train a decision tree classifier (DecisionTreeClassifier, random_state=0)
    that predicts the target (dependent) variable from the remaining
    (independent) columns, and evaluate it on the test part of
    train_and_test_split.

    Input Parameters: 
                    a. DataFrame
                    b. Target Variable: 'target_variable'
                    c. Output File Name: 'output_file_name' or None (If don't want to save)
                       The fitted model is pickled to output_file_name + '.sav'.
                    
    Returns: tuple of
                   a. Confusion Matrix
                   b. Classification Report
                   c. Accuracy (in percent)
  '''
  X_train, X_test, y_train, y_test=train_and_test_split(df,target_variable)
  model = DecisionTreeClassifier(random_state=0)
  model=model.fit(X_train,y_train)
  test_pred=model.predict(X_test)
  confusion_matrix_result=confusion_matrix(y_test,test_pred)
  classification_report_result=classification_report(y_test,test_pred)
  accuracy_score= metrics.accuracy_score(y_test,test_pred)* 100 
  if output_file_name is not None:
    # The context manager closes (and flushes) the file even if pickling fails.
    with open(output_file_name + '.sav', 'wb') as model_file:
      pickle.dump(model, model_file)
    print('Model Weights:', output_file_name + '.sav', ', got saved to location:',os.getcwd() + '/' + output_file_name + '.sav')

  return confusion_matrix_result,classification_report_result,accuracy_score
  
# 4.4  
def RandomForest_Classifier(df,target_variable,output_file_name):
  ''' 
    Train a random forest classifier (RandomForestClassifier, random_state=0)
    that predicts the target (dependent) variable from the remaining
    (independent) columns, and evaluate it on the test part of
    train_and_test_split.

    Input Parameters: 
                    a. DataFrame
                    b. Target Variable: 'target_variable'
                    c. Output File Name: 'output_file_name' or None (If don't want to save)
                       The fitted model is pickled to output_file_name + '.sav'.
                    
    Returns: tuple of
                   a. Confusion Matrix
                   b. Classification Report
                   c. Accuracy (in percent)
  '''
  X_train, X_test, y_train, y_test=train_and_test_split(df,target_variable)
  model = RandomForestClassifier( random_state=0)
  model=model.fit(X_train,y_train)
  test_pred=model.predict(X_test)
  confusion_matrix_result=confusion_matrix(y_test,test_pred)
  classification_report_result=classification_report(y_test,test_pred)
  accuracy_score= metrics.accuracy_score(y_test,test_pred)* 100 
  if output_file_name is not None:
    # The context manager closes (and flushes) the file even if pickling fails.
    with open(output_file_name + '.sav', 'wb') as model_file:
      pickle.dump(model, model_file)
    print('Model Weights:', output_file_name + '.sav', ', got saved to location:',os.getcwd() + '/' + output_file_name + '.sav')

  return confusion_matrix_result,classification_report_result,accuracy_score

# 4.5
def XGB_Classifier(df,target_variable,output_file_name):
  ''' 
    Train a gradient boosted tree classifier (XGBClassifier with default
    settings) that predicts the target (dependent) variable from the remaining
    (independent) columns, and evaluate it on the test part of
    train_and_test_split.

    Input Parameters: 
                    a. DataFrame
                    b. Target Variable: 'target_variable'
                    c. Output File Name: 'output_file_name' or None (If don't want to save)
                       The fitted model is pickled to output_file_name + '.sav'.
                    
    Returns: tuple of
                   a. Confusion Matrix
                   b. Classification Report
                   c. Accuracy (in percent)
  '''
  X_train, X_test, y_train, y_test=train_and_test_split(df,target_variable)
  model = XGBClassifier()
  # Fit and predict on the same representation (plain arrays); fitting on a
  # DataFrame but predicting on .values mixes named and unnamed features.
  model=model.fit(X_train.values,y_train)
  test_pred=model.predict(X_test.values)
  confusion_matrix_result=confusion_matrix(y_test,test_pred)
  classification_report_result=classification_report(y_test,test_pred)
  accuracy_score= metrics.accuracy_score(y_test,test_pred)* 100 
  if output_file_name is not None:
    # The context manager closes (and flushes) the file even if pickling fails.
    with open(output_file_name + '.sav', 'wb') as model_file:
      pickle.dump(model, model_file)
    print('Model Weights:', output_file_name + '.sav', ', got saved to location:',os.getcwd() + '/' + output_file_name + '.sav')

  return confusion_matrix_result,classification_report_result,accuracy_score


#5. REGRESSION MODELLING

# 5.1
def Linear_Regression(df,target_variable,output_file_name):
  ''' 
    Train a linear regression model (LinearRegression with default settings)
    that predicts the target (dependent) variable from the remaining
    (independent) columns, and evaluate it on the test part of
    train_and_test_split.

    Input Parameters: 
                    a. DataFrame
                    b. Target Variable: 'target_variable'
                    c. Output File Name: 'output_file_name' or None (If don't want to save)
                       The fitted model is pickled to output_file_name + '.sav'.
                    
    Returns: 
            Returns root mean squared error value for model evaluation  
                  
  '''
  X_train, X_test, y_train, y_test=train_and_test_split(df,target_variable)
  model = LinearRegression()
  model=model.fit(X_train,y_train)
  test_pred=model.predict(X_test)
  rmse_score=math.sqrt(mean_squared_error(y_test,test_pred))
  if output_file_name is not None:
    # The context manager closes (and flushes) the file even if pickling fails.
    with open(output_file_name + '.sav', 'wb') as model_file:
      pickle.dump(model, model_file)
    print('Model Weights:', output_file_name + '.sav', ', got saved to location:',os.getcwd() + '/' + output_file_name + '.sav')
  return rmse_score

# 5.2
def Support_Vector_Regressor(df,target_variable,output_file_name):
  ''' 
    Train a support vector regressor (SVR with default settings) that predicts
    the target (dependent) variable from the remaining (independent) columns,
    and evaluate it on the test part of train_and_test_split.

    Input Parameters: 
                    a. DataFrame
                    b. Target Variable: 'target_variable'
                    c. Output File Name: 'output_file_name' or None (If don't want to save)
                       The fitted model is pickled to output_file_name + '.sav'.
                    
    Returns: 
            Returns root mean squared error value for model evaluation  
                  
  '''
  X_train, X_test, y_train, y_test=train_and_test_split(df,target_variable)
  model = SVR()
  model=model.fit(X_train,y_train)
  test_pred=model.predict(X_test)
  rmse_score=math.sqrt(mean_squared_error(y_test,test_pred))

  if output_file_name is not None:
    # The context manager closes (and flushes) the file even if pickling fails.
    with open(output_file_name + '.sav', 'wb') as model_file:
      pickle.dump(model, model_file)
    print('Model Weights:', output_file_name + '.sav', ', got saved to location:',os.getcwd() + '/' + output_file_name + '.sav')
  return rmse_score

# 5.3
def Decision_Tree_Regressor(df,target_variable,output_file_name):
  ''' 
    Train a decision tree regressor (DecisionTreeRegressor, random_state=0)
    that predicts the target (dependent) variable from the remaining
    (independent) columns, and evaluate it on the test part of
    train_and_test_split.

    Input Parameters: 
                    a. DataFrame
                    b. Target Variable: 'target_variable'
                    c. Output File Name: 'output_file_name' or None (If don't want to save)
                       The fitted model is pickled to output_file_name + '.sav'.
                    
    Returns: 
            Returns root mean squared error value for model evaluation  
                  
  '''
  X_train, X_test, y_train, y_test=train_and_test_split(df,target_variable)
  model = DecisionTreeRegressor(random_state=0)
  model=model.fit(X_train,y_train)
  test_pred=model.predict(X_test)
  rmse_score=math.sqrt(mean_squared_error(y_test,test_pred))

  if output_file_name is not None:
    # The context manager closes (and flushes) the file even if pickling fails.
    with open(output_file_name + '.sav', 'wb') as model_file:
      pickle.dump(model, model_file)
    print('Model Weights:', output_file_name + '.sav', ', got saved to location:',os.getcwd() + '/' + output_file_name + '.sav')
  return rmse_score 

# 5.4
def RandomForest_Regressor(df,target_variable,output_file_name):
  ''' 
    Train a random forest regressor (RandomForestRegressor, random_state=0)
    that predicts the target (dependent) variable from the remaining
    (independent) columns, and evaluate it on the test part of
    train_and_test_split.

    Input Parameters: 
                    a. DataFrame
                    b. Target Variable: 'target_variable'
                    c. Output File Name: 'output_file_name' or None (If don't want to save)
                       The fitted model is pickled to output_file_name + '.sav'.
                    
    Returns: 
            Returns root mean squared error value for model evaluation  
                  
  '''
  X_train, X_test, y_train, y_test=train_and_test_split(df,target_variable)
  model = RandomForestRegressor( random_state=0)
  model=model.fit(X_train,y_train)
  test_pred=model.predict(X_test)
  rmse_score=math.sqrt(mean_squared_error(y_test,test_pred))

  if output_file_name is not None:
    # The context manager closes (and flushes) the file even if pickling fails.
    with open(output_file_name + '.sav', 'wb') as model_file:
      pickle.dump(model, model_file)
    print('Model Weights:', output_file_name + '.sav', ', got saved to location:',os.getcwd() + '/' + output_file_name + '.sav')
  return rmse_score

#5.5
def XGB_Regressor(df,target_variable,output_file_name):
  ''' 
    Train a gradient boosted tree regressor (XGBRegressor, squared-error
    objective) that predicts the target (dependent) variable from the remaining
    (independent) columns, and evaluate it on the test part of
    train_and_test_split.

    Input Parameters: 
                    a. DataFrame
                    b. Target Variable: 'target_variable'
                    c. Output File Name: 'output_file_name' or None (If don't want to save)
                       The fitted model is pickled to output_file_name + '.sav'.
                    
    Returns: 
            Returns root mean squared error value for model evaluation  
                  
  '''
  X_train, X_test, y_train, y_test=train_and_test_split(df,target_variable)
  # 'reg:squarederror' is the current name of the objective formerly spelled
  # 'reg:linear' (the old alias is deprecated in XGBoost).
  model = xg.XGBRegressor(objective ='reg:squarederror')
  model=model.fit(X_train.values,y_train.values)
  test_pred=model.predict(X_test.values)
  rmse_score=math.sqrt(mean_squared_error(y_test,test_pred))

  if output_file_name is not None:
    # The context manager closes (and flushes) the file even if pickling fails.
    with open(output_file_name + '.sav', 'wb') as model_file:
      pickle.dump(model, model_file)
    print('Model Weights:', output_file_name + '.sav', ', got saved to location:',os.getcwd() + '/' + output_file_name + '.sav')
  return rmse_score

#6. PREDICTION

def predict(model_weights,input_array):
  ''' 
    Prediction of output based on user input provided to a saved model.

    Input Parameters: 
                    a. Model Weights: path of the saved model file (.sav pickle file)
                    b. Input Array: input array (independent feature attributes)
                       for ONE sample, example: [1,2,1,2]
                    
    Returns: 
           Returns prediction results (whatever the loaded model's predict()
           returns for the single-row batch [input_array])
                  
  '''
  # SECURITY: unpickling executes code embedded in the file - only load model
  # files that come from a trusted source.
  with open(model_weights, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
  # The model expects a 2-D batch, so the single sample is wrapped in a list.
  predicted=loaded_model.predict([input_array])
  return predicted

In [None]:
# df['randNumCol'] = np.random.choice(['male','female'], df.shape[0])