In [None]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.2.2-py2.py3-none-any.whl (80 kB)
[?25l[K     |████                            | 10 kB 20.6 MB/s eta 0:00:01[K     |████████▏                       | 20 kB 26.1 MB/s eta 0:00:01[K     |████████████▏                   | 30 kB 28.3 MB/s eta 0:00:01[K     |████████████████▎               | 40 kB 19.3 MB/s eta 0:00:01[K     |████████████████████▎           | 51 kB 8.7 MB/s eta 0:00:01[K     |████████████████████████▍       | 61 kB 10.0 MB/s eta 0:00:01[K     |████████████████████████████▍   | 71 kB 9.5 MB/s eta 0:00:01[K     |████████████████████████████████| 80 kB 5.3 MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.2.2


In [None]:
!pip install pywaffle

Collecting pywaffle
  Downloading pywaffle-0.6.3-py2.py3-none-any.whl (526 kB)
[?25l[K     |▋                               | 10 kB 21.6 MB/s eta 0:00:01[K     |█▎                              | 20 kB 26.2 MB/s eta 0:00:01[K     |█▉                              | 30 kB 27.4 MB/s eta 0:00:01[K     |██▌                             | 40 kB 19.0 MB/s eta 0:00:01[K     |███                             | 51 kB 8.7 MB/s eta 0:00:01[K     |███▊                            | 61 kB 10.0 MB/s eta 0:00:01[K     |████▍                           | 71 kB 9.5 MB/s eta 0:00:01[K     |█████                           | 81 kB 10.5 MB/s eta 0:00:01[K     |█████▋                          | 92 kB 10.6 MB/s eta 0:00:01[K     |██████▏                         | 102 kB 9.0 MB/s eta 0:00:01[K     |██████▉                         | 112 kB 9.0 MB/s eta 0:00:01[K     |███████▌                        | 122 kB 9.0 MB/s eta 0:00:01[K     |████████                        | 133 kB 9.0 MB/s eta 0

## **Importing Libraries**

In [None]:
import pandas as pd
import numpy as np
import json
import csv
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from dateutil.parser import parser

from category_encoders import TargetEncoder,OneHotEncoder,HashingEncoder

from category_encoders import TargetEncoder,OneHotEncoder,OrdinalEncoder,HashingEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score 
from sklearn.preprocessing import LabelEncoder

from pywaffle import Waffle

  import pandas.util.testing as tm


## **Reading Data**

In [None]:
def _read_pipe_delimited(data_path):
    """Read a '|'-delimited text file whose first line is the header row.

    Header names are whitespace-stripped; data values are kept as the raw
    strings produced by ``csv.reader``. An empty file yields an empty frame.
    """
    # A single context manager replaces the original pair of open() calls,
    # one of which was never closed.
    with open(data_path, 'r', encoding='utf-8-sig') as infh:
        header_line = infh.readline()
        if not header_line:
            return pd.DataFrame()

        keys = [name.strip() for name in header_line.split("|")]
        df_dict = {k: [] for k in keys}

        # The header line is already consumed, so the reader starts on data.
        for row in csv.reader(infh, delimiter='|'):
            for key, value in zip(df_dict.keys(), row):
                df_dict[key].append(value)

    return pd.DataFrame(df_dict)


def _fetch_tracks(url, params):
    """GET ``url`` and tabulate the track entries found under ``"results"``."""
    resp = requests.get(url, params=params)

    if resp.status_code != 200:
        # Previously this path fell through to `return df` with `df` unbound.
        raise requests.exceptions.HTTPError(
            "Request to {} failed with status code {}".format(url, resp.status_code),
            response=resp,
        )

    resp_dict = json.loads(resp.text)

    headers = ["trackName", "releaseDate", "collectionName", "trackPrice", "trackNumber"]
    df_dict = {k: [] for k in headers}

    for track in resp_dict["results"]:
        # Read every field before appending anything: a track missing one key
        # must be skipped as a whole, otherwise the columns end up with
        # different lengths.
        try:
            values = [track[field] for field in headers]
        except KeyError:
            continue

        for field, value in zip(headers, values):
            df_dict[field].append(value)

    date_parser = parser()
    df_dict["releaseDate"] = [
        date_parser.parse(date).strftime("%m/%d/%Y, %H:%M:%S")
        for date in df_dict["releaseDate"]
    ]

    return pd.DataFrame.from_dict(df_dict)


def conv_to_df(data_path, *kwargs):
    """Load ``data_path`` into a pandas DataFrame, dispatching on its extension.

    Supported extensions (case-insensitive): ``csv``, ``xlsx`` (first column
    becomes the index), ``txt`` (pipe-delimited with a header row), ``json``
    and ``tsv``. Anything else, including every ``http://`` / ``https://``
    address, is treated as a web endpoint and fetched with ``requests``.

    Parameters
    ----------
    data_path : str
        File path or URL.
    *kwargs
        Optional; the first extra positional argument is passed as the query
        ``params`` of the HTTP request. It is ignored for local files.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    requests.exceptions.HTTPError
        If the web request does not return status code 200.
    """
    import os  # local import so this cell does not depend on the import cell

    # URLs are routed to the web branch regardless of how their path ends.
    is_url = data_path.lower().startswith(("http://", "https://"))

    # splitext looks only at the final extension, so dots in directory names
    # or in the file stem no longer confuse the dispatch.
    file_format = "" if is_url else os.path.splitext(data_path)[1].lstrip(".").lower()

    if file_format == "csv":
        return pd.read_csv(data_path)

    if file_format == "xlsx":
        return pd.read_excel(data_path, index_col=0)

    if file_format == "txt":
        return _read_pipe_delimited(data_path)

    if file_format == "json":
        with open(data_path) as datafile:
            data = json.load(datafile)
        return pd.DataFrame(data)

    if file_format == "tsv":
        return pd.read_csv(data_path, sep="\t")

    # The original body referenced an undefined local `params`. Take it from
    # the extra positional argument, falling back to a notebook-level `params`
    # variable (the only way the original could have worked).
    params = kwargs[0] if kwargs else globals().get("params")
    return _fetch_tracks(data_path, params)

In [None]:
def df_info(df, info, *kwargs):
    """Print a quick textual overview of ``df``.

    Prints, in order: shape, the first N rows, column dtypes, the
    ``DataFrame.info()`` summary, per-column null counts and ``describe()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    info
        Unused; kept so existing calls keep working.
    *kwargs
        Optional; the first extra positional argument is the number of rows to
        preview. Defaults to 5 when omitted.
    """
    import io  # local import: only needed to capture df.info() output

    # Previously kwargs[0] was read unconditionally and raised IndexError
    # when the caller passed no row count.
    n_rows = kwargs[0] if kwargs else 5

    print("Shape of Dataset: {}\n".format(df.shape))

    print("Top {} Rows :\n {}\n".format(n_rows, df.head(n_rows)))

    print("DataFrame Data Types:\n {} \n".format(df.dtypes))

    # df.info() writes to a stream and returns None, so capture its text.
    # The original formatted the uncalled method `df.info` instead.
    info_buffer = io.StringIO()
    df.info(buf=info_buffer)
    print("DataFrame Information: \n{}\n".format(info_buffer.getvalue()))

    print("Null Values Per Column: \n{}\n".format(df.isnull().sum()))

    print("DataFrame Description: \n{}\n".format(df.describe()))

## **Feature Engineering**

In [None]:
#Converting Categorical values to Numerical values
def transform_categorical_data(train,validate,test,cat_dict,y):
  """Encode categorical columns of the three splits according to ``cat_dict``.

  ``cat_dict`` maps an encoding name to the list of columns it applies to.
  Recognised names are "One-Hot Encoding", "Label Encoding",
  "Target Encoding" and "Hash Encoding"; they are applied in that order and
  every encoder is fitted on ``train`` only. ``y`` is the target passed to the
  target encoder's ``fit``.

  Label encoding assigns into the frames it is given, so the caller's objects
  are modified by that step.

  NOTE: the result is returned as ``(train, test, validate)``, which is not
  the order of the parameters.
  """
  if "One-Hot Encoding" in cat_dict:
    one_hot = OneHotEncoder(cols=cat_dict['One-Hot Encoding'])
    train = one_hot.fit_transform(train)
    validate = one_hot.transform(validate)
    test = one_hot.transform(test)

  if "Label Encoding" in cat_dict:
    label_encoder = LabelEncoder()
    for name in cat_dict['Label Encoding']:
        # Refit per column on train, then reuse that fit for the other splits.
        train[name] = label_encoder.fit_transform(train[name])
        for frame in (validate, test):
            frame[name] = label_encoder.transform(frame[name])

  if "Target Encoding" in cat_dict:
    target_encoder = TargetEncoder(cols=cat_dict['Target Encoding'])
    target_encoder.fit(train, y)
    train = target_encoder.transform(train)
    validate = target_encoder.transform(validate)
    test = target_encoder.transform(test)

  if "Hash Encoding" in cat_dict:
    hasher = HashingEncoder(cols=cat_dict['Hash Encoding'])
    train = hasher.fit_transform(train)
    validate = hasher.transform(validate)
    test = hasher.transform(test)

  return train, test, validate

## **Preprocessing**

In [None]:
# In the Capping step, we convert the values above the max value to the max value
# And those below the min value to the min value

def detect_and_remove_outliers(df,type,col):
  """Cap the outliers of ``df[col]`` in place and return ``df``.

  ``type`` selects how the bounds are computed:
    * "Normal" - mean plus/minus three standard deviations.
    * "Skew"   - 1.5 * IQR beyond the 25th / 75th percentiles.
  Any other value leaves ``df`` untouched.

  Values above the upper bound are replaced by the bound itself, and likewise
  for values below the lower bound; no rows are dropped.
  """
  if type == "Normal":
    centre = df[col].mean()
    spread = df[col].std()
    upper = centre + 3*spread
    lower = centre - 3*spread
  elif type == "Skew":
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    upper = q3 + 1.5*iqr
    lower = q1 - 1.5*iqr
  else:
    return df

  # Capping step: clamp to [lower, upper].
  values = df[col]
  capped_low = np.where(values < lower, lower, values)
  df[col] = np.where(values > upper, upper, capped_low)

  return df

In [None]:
def transform_date(df,col):
  """Split "/"-separated date values into Day / Month / Year columns.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame the new columns are joined onto.
  col : iterable
      The date values themselves (not a column name). Each value is converted
      with ``str`` and split on "/"; the first three pieces become "Day",
      "Month" and "Year" in that order, as strings.

  Returns
  -------
  pandas.DataFrame
      A new frame: ``df`` joined with the three added columns.

  Raises
  ------
  IndexError
      If a value has fewer than three "/"-separated pieces.
  """
  # Materialise once so generators work and the length is known.
  parts = [str(val).split("/") for val in col]

  date_dict = {
      "Day": [piece[0] for piece in parts],
      "Month": [piece[1] for piece in parts],
      "Year": [piece[2] for piece in parts],
  }

  # join() aligns on index labels. With one value per row, reuse df's index
  # so rows stay aligned even when df does not have a default 0..n-1 index
  # (for example after a shuffle or split). Otherwise keep the positional
  # index the original code used.
  index = df.index if len(parts) == len(df) else None

  date_df = pd.DataFrame(date_dict, columns=["Day","Month","Year"], index=index)
  return df.join(date_df)

## **Plotting**

In [None]:
def df_plots(df, plot_, *kwargs):
    """Draw one of several quick-look plots for ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.
    plot_ : str
        "box"    - box plot of all columns, or of the column(s) given as the
                   first extra argument.
        "corr"   - annotated heatmap of ``df.corr()``.
        "waffle" - waffle chart of the value counts of the column named by the
                   first extra argument (required for this plot type).
        Any other value does nothing.
    *kwargs
        Optional extra positional arguments; only the first one is used.

    Raises
    ------
    IndexError
        If ``plot_ == "waffle"`` and no column name is supplied.
    """
    if plot_ == "box":
        if not len(kwargs):
            df.boxplot(figsize=(20,20))
        else:
            df.boxplot(figsize=(20,20), column=kwargs[0])

    elif plot_ == "corr":
        plt.figure(figsize=(20, 20))

        # NOTE(review): vmin=0 means negative correlations all share the
        # lowest colour; the annotations still show their values.
        heatmap = sns.heatmap(df.corr(), vmin=0, vmax=1, annot=True, cmap='Blues')
        heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':18})

        plt.show()

    elif plot_ == "waffle":
        column = kwargs[0]

        # Use the frame passed in. The original read a global `train`, which
        # ignored `df` and failed when no such global existed.
        col_vals = df[column].value_counts()

        # NOTE(review): labels use the positional rank of each category, not
        # its name - confirm whether the category value was intended.
        plt.figure(
            FigureClass=Waffle,
            rows=5,
            columns=10,
            values=col_vals,
            title={
                'label': 'Count of {}'.format(column),
                'loc': 'center',
                'size': 20,
            },
            labels=["{}[{}]".format(i, v) for i, v in enumerate(col_vals)],
        )

In [None]:
def feature_plot(model,train,type):
  """Bar-plot a fitted model's per-feature weights, largest first.

  For ``type == "Linear Regression"`` the weights are ``model.coef_``; for any
  other value they are ``model.feature_importances_``. Each weight is paired
  with the matching column of ``train`` and drawn with seaborn.
  """
  if type == "Linear Regression":
    weights = model.coef_
  else:
    weights = model.feature_importances_

  # Sort (weight, column) pairs in descending order.
  ranked = list(zip(weights, train.columns))
  ranked.sort(reverse=True)

  ranked_df = pd.DataFrame(ranked, columns=["Values","Features"])
  sns.barplot(x="Values", y="Features", data=ranked_df)