In [45]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer
from sklearn.preprocessing import RobustScaler

from sklearn.model_selection import train_test_split

from sklearn.metrics import r2_score, mean_absolute_error

from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor, ExtraTreesRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

In [46]:
demo_df = pd.read_csv(r"Dummy Data HSS.csv")

In [47]:
demo_df.head()

Unnamed: 0,TV,Radio,Social Media,Influencer,Sales
0,16.0,6.566231,2.907983,Mega,54.732757
1,13.0,9.237765,2.409567,Mega,46.677897
2,41.0,15.886446,2.91341,Mega,150.177829
3,83.0,30.020028,6.922304,Mega,298.24634
4,15.0,8.437408,1.405998,Micro,56.594181


In [48]:
demo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4572 entries, 0 to 4571
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   TV            4562 non-null   float64
 1   Radio         4568 non-null   float64
 2   Social Media  4566 non-null   float64
 3   Influencer    4572 non-null   object 
 4   Sales         4566 non-null   float64
dtypes: float64(4), object(1)
memory usage: 178.7+ KB


In [49]:
def impute(data):
    #checking missing values
    percent_missing = data.isnull().sum() * 100 / data.shape[0]
    #dropping columns if missing percentage is more than 30
    for i in range(len(data.columns)):
        if percent_missing[i] >30:
            data.drop(data.columns[i],axis=1,inplace=True)
    #getting numerical and categorical variables
    numerical_columns = [x for x in data.columns if data[x].dtype != 'object']
    data_num = data[numerical_columns]
    
    cat_columns = [x for x in data.columns if x not in numerical_columns]
    data_cat = data[cat_columns]
    
    #Imputing using KNN Imputer for numerical columns
    imputer = KNNImputer(n_neighbors=2)
    imputed_num = imputer.fit_transform(data_num)
    imputed_num = pd.DataFrame(imputed_num)
    imputed_num.columns=data_num.columns
    
    # most frequent imputation for categorical columns
    data_cat_imputed = data_cat.apply(lambda x: x.fillna(x.value_counts().index[0]))
    
    #concat the imputed dfs
    imputed_data = pd.concat([imputed_num, data_cat_imputed], axis=1)
    
    #return imputed_data
    return imputed_data

In [50]:
ll = impute(demo_df)

In [51]:
ll.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4572 entries, 0 to 4571
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   TV            4572 non-null   float64
 1   Radio         4572 non-null   float64
 2   Social Media  4572 non-null   float64
 3   Sales         4572 non-null   float64
 4   Influencer    4572 non-null   object 
dtypes: float64(4), object(1)
memory usage: 178.7+ KB


In [52]:
def EDA(imputed_data,y):
    #pairplot for all attributes
    sns.pairplot(imputed_data, hue=y)
    
    if 
    sns.heatmap(imputed_data)



SyntaxError: ignored

In [53]:
EDA(ll, 'Sales')

NameError: ignored

In [55]:
def normalize_and_encode(imputed_data):
    #normalizing numerical columns using robustscalar
    numerical_columns  = [x for x in imputed_data.columns if imputed_data[x].dtype in ['int64', 'float64']]
    scalar = RobustScaler(quantile_range=(25,75))
    scaled = scalar.fit_transform(imputed_data[numerical_columns])
    scaled = pd.DataFrame(scaled)
    scaled.columns = imputed_data[numerical_columns].columns
    
    #dropping cat columns with more than 10 categories
    cat_cols = [x for x in imputed_data.columns if x not in numerical_columns]
    cat_cols_to_drop = []
    for col in cat_cols:
        if imputed_data[col].value_counts().count()>10:
            cat_cols_to_drop.append(col)
    data_for_enc = imputed_data.drop(numerical_columns,axis=1)
    data_for_enc.drop(cat_cols_to_drop,axis=1,inplace=True)

    #encoding categorical varialbles
    enc_data= pd.get_dummies(data_for_enc, columns=data_for_enc.columns)
    
    encoded_data = pd.concat([scaled, enc_data], axis=1)

    return encoded_data

In [56]:
demo_df.head()

Unnamed: 0,TV,Radio,Social Media,Influencer,Sales
0,16.0,6.566231,2.907983,Mega,54.732757
1,13.0,9.237765,2.409567,Mega,46.677897
2,41.0,15.886446,2.91341,Mega,150.177829
3,83.0,30.020028,6.922304,Mega,298.24634
4,15.0,8.437408,1.405998,Micro,56.594181


In [57]:
train = normalize_and_encode(ll)
train.head()

Unnamed: 0,TV,Radio,Social Media,Sales,Influencer_Macro,Influencer_Mega,Influencer_Micro,Influencer_Nano
0,-0.822222,-0.746141,-0.044994,-0.839357,0,1,0,0
1,-0.888889,-0.569634,-0.196947,-0.889624,0,1,0,0
2,-0.266667,-0.130359,-0.043339,-0.243718,0,1,0,0
3,0.666667,0.803439,1.178863,0.680325,0,1,0,0
4,-0.844444,-0.622513,-0.502908,-0.82774,0,0,1,0


In [58]:
reg_models = [
    KNeighborsRegressor(),
    GradientBoostingRegressor(),
    KNeighborsRegressor(),
    ExtraTreesRegressor(),
    RandomForestRegressor(),
    DecisionTreeRegressor(),
    LinearRegression(),
    Lasso(),
    Ridge()
]

In [59]:
from sklearn.model_selection import RandomizedSearchCV


In [75]:
def training(train_data, y, reg_models):
  y_class = train_data[[y]]
  
  X_train, X_val, y_train, y_val = train_test_split(train_data.drop(y, axis=1), y_class, test_size=0.2, random_state=100)
  
  res = {}
  linear_grid = {
    'fit_intercept': [True, False]
  }
  lasso_grid = {
      'alpha': [0.1, 0.2, 0.5]
  }
  params = { 
      'LinearRegression': linear_grid, 
      'Lasso': lasso_grid
    }

  for reg in reg_models:
    name = reg.__class__.__name__  
    try:
      clf = RandomizedSearchCV(reg, params[name], random_state=0)
    except:
      print(name)
      continue
    results = clf.fit(X_train, y_train)
    print(results.best_params_)
    r2 = round(r2_score(y_val, clf.predict(X_val)), 3)
    mae = round(mean_absolute_error(y_val, clf.predict(X_val)), 3)
    
    print("{} trained with an MAE of : {} and an accuracy of: {}".format(name, mae, r2))
    
    res[name] = (mae,r2)
    
  return res

In [76]:
# def training(train_data, y, reg_models):

#   y_class = train_data[[y]]
#   train_data.drop(y, axis=1, inplace=True)
  
#   X_train, X_val, y_train, y_val = train_test_split(train_data, y_class, test_size=0.2, random_state=100)
  
#   res = {}
  
#   for reg in reg_models:
#     name = reg.__class__.__name__  
#     reg.fit(X_train, y_train)
    
#     r2 = round(r2_score(y_val, reg.predict(X_val)), 3)
#     mae = round(mean_absolute_error(y_val, reg.predict(X_val)), 3)
    
#     print("{} trained with an MAE of : {} and an accuracy of: {}".format(name, mae, r2))
    
#     res[name] = (mae,r2)
    
#   return res

In [77]:
training(train, 'Sales', reg_models=reg_models)

KNeighborsRegressor
GradientBoostingRegressor
KNeighborsRegressor
ExtraTreesRegressor
RandomForestRegressor
DecisionTreeRegressor




{'fit_intercept': True}
LinearRegression trained with an MAE of : 0.015 and an accuracy of: 0.999




{'alpha': 0.1}
Lasso trained with an MAE of : 0.147 and an accuracy of: 0.912
Ridge


{'LinearRegression': (0.015, 0.999), 'Lasso': (0.147, 0.912)}