In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Stats
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Text Preprocessing
import re
from nltk.corpus import stopwords
from string import punctuation

# Text Visualisation
from wordcloud import WordCloud
from nltk import FreqDist

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# LinReg
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Misc
import warnings 
warnings.filterwarnings("ignore")

from collections import Counter

### Utils Function

In [None]:
def update(df):
    """
    This function splits a dataframe into its text (object) columns and its numeric columns. It helps when you want to
    refresh both sub-dataframes after the main dataframe has changed.
    
    Parameters
    ----------
    df: pandas DataFrame
    This specifies the dataframe to split. It is not modified.
    
    Returns
    -------
    object_df: pandas DataFrame
    Columns of df whose dtype is object.
    
    numeric_df: pandas DataFrame
    Columns of df whose dtype is int64 or float64.
    
    Notes
    -----
    Only the two sub-dataframes are returned; df itself is not part of the return value.
    Columns with any other dtype (e.g. category, bool, int32) end up in neither dataframe, so please check the 
    datatypes of the df before using this function.
    """
    dtype_groups = ("object", ['int64', 'float64'])
    object_df, numeric_df = (df.select_dtypes(group) for group in dtype_groups)
    
    return object_df, numeric_df

In [None]:
def uniqueness(col, df):
    """
    This function prints the unique labels, the number of unique labels and the occurrences of each label in the given 
    categorical feature.
    
    Parameters
    ----------
    col: string-like
    This specifies the categorical feature (column name in df).
    
    df: pandas DataFrame
    This specifies the dataframe that contains col.
    
    Returns
    -------
    None
    """
    labels = df[col].unique()
    occurrences = df[col].value_counts()

    summary = (
        ("Unique Labels:", labels),
        ("Number of Labels:", len(labels)),
    )
    for heading, value in summary:
        print(heading, value)

    # sep='' keeps the counts table flush against the start of its first line.
    print("Occurances of Labels:\n", occurrences, sep='')
    return None

In [None]:
# Build Function to Measure Skew and Kurtosis
def _describeSkewKurtosis(series):
    """Print one line interpreting the skew and one line interpreting the (excess) kurtosis of a pandas Series."""
    skew = series.skew()
    kurtosis = series.kurtosis()

    if skew == 0:
        print("Skew: %f. This represents a Normal Skew where Mean = Median." % skew)
    elif skew < 0:
        print("Skew: %f. This represents a Negative Skew / Right Modal / Left-Skew where the Median is greater than Mean." % skew)
    else:
        print("Skew: %f. This represents a Positive Skew / Left Modal / Right-Skew where the Median is less than Mean." % skew)

    if kurtosis == 0:
        print("Kurtosis: %s. This represents a Mesokurtic (Normal) Distribution." % kurtosis)
    elif kurtosis < 0:
        print("Kurtosis: %s. This represents a Platykurtic (Fat) Distribution." % kurtosis)
    else:
        print("Kurtosis: %s. This represents a Leptokurtic (Skinny) Distribution." % kurtosis)


def skewKurtosis(cols, df):
    """
    This function analyses the skew and kurtosis of the data and prints an interpretation of each.
    
    Parameters
    ----------
    cols: string-like or array-like
    This specifies the column(s) to be analysed. A single column name prints only the two interpretation lines; a 
    collection of names prints each column name, its two lines and a blank separator line.
    
    df: pandas DataFrame
    This specifies the DataFrame
    
    Returns
    -------
    None
    
    Skew
    ----
    Left-Skew / Right-Modal: Skew < 0 || Median > Mean
    Normal: Skew = 0 
    Right-Skew / Left-Modal: Skew > 0 || Median < Mean
    
    Kurtosis
    --------
    Platykurtic Distribution: Kurtosis < 0
    Mesokurtic Distribution: Kurtosis = 0
    Leptokurtic Distribution: Kurtosis > 0
    
    """
    # isinstance (rather than type(...) == str) also recognises str subclasses such as numpy.str_.
    if isinstance(cols, str):
        _describeSkewKurtosis(df[cols])
        return None

    for col in cols:
        print(col)
        _describeSkewKurtosis(df[col])
        print()
    return None

In [None]:
# Encode the Features using LabelEncoder()
def encode(col, df):
    """
    This function label-encodes one categorical column and prints the classes found by the encoder.
    
    Parameters
    ----------
    col: string-like
    This specifies the column to be encoded.
    
    df: pandas DataFrame
    This specifies the DataFrame that contains col. The column is overwritten in place with its encoded values.
    
    Returns
    -------
    df: pandas DataFrame
    The same DataFrame object that was passed in, with col replaced by the encoder output.
    """
    label_encoder = LabelEncoder()
    encoded_values = label_encoder.fit_transform(df[col])
    df[col] = encoded_values

    print(f"Classes for {col}:", label_encoder.classes_)

    return df

In [None]:
# Build Function to Compare PMCC
def correlation(cols, target, df):
    """
    This function prints the correlation between column(s) and the target, as computed by pandas Series.corr with its
    default settings.
    
    Parameters
    ----------
    cols: string-like or array-like
    This specifies the column(s) to test against the target. The target itself is skipped if it appears here.
    
    target: string-like
    This specifies the column to be tested with columns.
    
    df: pandas DataFrame
    This specifies the DataFrame.
    
    Returns
    -------
    None
    """
    # A single name is wrapped in a list so both cases share one loop. isinstance also accepts str subclasses
    # (e.g. numpy.str_), which would otherwise be iterated character by character.
    columns = [cols] if isinstance(cols, str) else cols

    for col in columns:
        if col == target:
            continue
        print(f"Correlation between {target} and {col}:", df[target].corr(df[col]))
    return None

In [None]:
# Linear Regression Function 

def linreg(X, y, df, testsize=0.25, random_state=None):
    """
    This function encapsulates the pipeline for using Linear Regression and predictors to predict the response column:
    train/test split, fit, predict and score.
    
    Parameters
    ----------
    X: string-like or array-like 
    This parameter contains the predictor column name(s) to be used for predicting y. A list, pandas Index or numpy 
    array is treated as several column names; anything else is treated as one column label.
    
    y: string
    This parameter specifies the output column.
    
    df: pandas DataFrame
    This parameter specifies the data.
    
    testsize: float or int
    This parameter is passed to train_test_split as test_size.
    
    random_state: int or None
    This parameter is passed to train_test_split as random_state. The default (None) keeps the original behaviour of a 
    different random split on every call; pass an integer to make the split, and therefore every returned metric, 
    reproducible.
    
    Returns
    -------
    coeff: array-like
    lr.coef_ of the fitted model. Because y is passed to the model as a one-column DataFrame, the coefficients of the 
    predictors are in coeff[0].
    
    intercept: array-like
    lr.intercept_ of the fitted model.
    
    pred_train: Array-like
    This represents the model predictions on X_train. 
    
    pred_test: Array-like
    This represents the model predictions on X_test.
    
    train_r2: float-like
    This value is lr.score (R2) on the training data. It measures the goodness of fit on training data. 
    
    test_r2: float-like
    This value is lr.score (R2) on the testing data. It measures the goodness of fit on testing data. 
    
    train_mse: float-like
    This value represents the mean square error on training data.
    
    test_mse: float-like
    This value represents the mean square error on testing data.
    
    X_train: pandas DataFrame
    This represents the input data used in training the model.
    
    X_test: pandas DataFrame
    This represents the input data used in testing the model.
    
    y_train: pandas DataFrame
    This represents the output data used in training the model.
    
    y_test: pandas DataFrame
    This represents the output data used in testing the model.
    """
    
    # Define Input and Output Variables (both kept two-dimensional)
    if isinstance(X, (list, pd.Index, np.ndarray)):
        X = df[list(X)]
    else:
        X = df[[X]]
    
    y = df[[y]]
    
    # Train Test Split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=testsize, random_state=random_state
    )
    
    # Initialise and Fit Linear Regression Model
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    coeff, intercept = lr.coef_, lr.intercept_
    
    # Predict Model
    pred_train = lr.predict(X_train)
    pred_test = lr.predict(X_test)
    
    # Metrics 
    train_r2 = lr.score(X_train, y_train)
    test_r2 = lr.score(X_test, y_test)
    
    train_mse = mean_squared_error(y_train, pred_train)
    test_mse = mean_squared_error(y_test, pred_test)
    
    return coeff, intercept, pred_train, pred_test, train_r2, test_r2, train_mse, test_mse, X_train, X_test, y_train, y_test

In [None]:
# Visualise the predictions
# 1. print out metrics scores
# 2. display best-fit lines on the train and test data

def visualiseModel(col, y, df, coeff, intercept, pred_train, pred_test, train_r2, test_r2, train_mse, test_mse, X_train, X_test, y_train, y_test):
    """
    This function visualises the model's training and predictions. It prints out the metrics (R2 and MSE) scores and the
    fitted coefficients and, for a single predictor, displays the best-fit line on the train and test data.
    
    Parameters
    ----------
    col: string or list
    This represents the column(s) that was used in training and evaluating the model. A list prints a coefficient table
    and skips the plots.
    
    y: string-like
    This specifies the response variable (a column of y_train / y_test).
    
    df: pandas DataFrame
    This represents the DataFrame. It is not used by this function.
    
    coeff: array-like
    This represents the coefficient(s) of the best fit line (y=aX+c). It is flattened, so both lr.coef_ of shape 
    (1, n_features) and a flat array are accepted.
    
    intercept: float-like or array-like
    This represents the intercept of the best fit line (y=aX+c). Only its first element is used.
    
    pred_train: Array-like
    This represents the model predictions on X_train. It is not used by this function.
    
    pred_test: Array-like
    This represents the model predictions on X_test. It is not used by this function.
    
    train_r2: float-like
    This value is the R2 score on the training data. 
    
    test_r2: float-like
    This value is the R2 score on the testing data. 
    
    train_mse: float-like
    This value represents the mean square error on training data.
    
    test_mse: float-like
    This value represents the mean square error on testing data.
    
    X_train: pandas DataFrame
    This represents the input data used in training the model.
    
    X_test: pandas DataFrame
    This represents the input data used in testing the model.
    
    y_train: pandas DataFrame
    This represents the output data used in training the model.
    
    y_test: pandas DataFrame
    This represents the output data used in testing the model.
    
    Returns
    -------
    None
    """
    print(f"Linear Regression with {col}")
    
    # Print out metrics score
    print("TRAINING")
    print("train_r2:", train_r2)
    print("train_mse:", train_mse)
    print('\nTESTING')
    print("test_r2:", test_r2)
    print("test_mse", test_mse)
    print()
    
    # Work with plain floats: formatting a size-1 ndarray with %f relies on a deprecated implicit conversion.
    coefficients = np.ravel(coeff)
    intercept_value = float(np.ravel(intercept)[0])
    
    if isinstance(col, list):
        # One row per predictor; the scalar intercept is repeated on every row.
        print(pd.DataFrame({'coef_': coefficients, 'intercept_': intercept_value}, index=col))
        print()
        print("Too many cols to plot!")
        return None
    
    slope = float(coefficients[0])
    print(f"Best Fit Line Equation for {col}: y = {slope:f} * {col} + {intercept_value:f}")
    print()
    
    # Display Fits: training data on the left, testing data on the right
    f, axes = plt.subplots(nrows=1, ncols=2, figsize=(20,10))
    panels = (("Training", X_train, y_train), ("Testing", X_test, y_test))
    
    for ax, (stage, X_part, y_part) in zip(axes, panels):
        ax.scatter(X_part[col], y_part[y], s=20, alpha=0.5)
        ax.plot(X_part[col], slope * X_part[col] + intercept_value, 'r-', label='Best Fit Line')
        ax.set_title(f"{stage} Fit for {col}")
        ax.set_xlabel(col)
        ax.set_ylabel(y)
        # Per-axes legend: a single plt.legend() would only label the last subplot.
        ax.legend()
    
    return None

In [None]:
# How do I go about transforming the data?
def logTransform(col):
    """
    This function applies the natural logarithm to the data, which pulls in the long right tail of right-skewed data so 
    that it looks more normally distributed. 
    - Strong transformation for reducing right skew.
    - Only suitable for strictly positive values: the numpy log of zero is -inf and of a negative number is NaN.
    
    Parameters
    ----------
    col: pandas Series 
    This specifies the data to be manipulated.
    
    Returns
    -------
    log_values: pandas Series
    Log Transformed data.
    """
    log_values = np.log(col)
    return log_values

def squareRootTransform(col):
    """
    This function applies the square root to the data, which makes right-skewed data look more normally distributed. 
    - Weaker transformation as compared to the log transformation. 
    - Works for zero values (the numpy square root of a negative number is NaN).
    
    Parameters
    ----------
    col: pandas Series 
    This specifies the data to be manipulated.
    
    Returns
    -------
    root_values: pandas Series
    Square Root Transformed data.
    
    """
    root_values = np.sqrt(col)
    return root_values

def powerTransform(col):
    """
    This function squares the data, which makes left-skewed data look more normally distributed. 
    - Works for zero values.
    
    Parameters
    ----------
    col: pandas Series 
    This specifies the data to be manipulated.
    
    Returns
    -------
    squared_values: pandas Series
    Power Two Transformed data.
    """
    squared_values = col ** 2
    return squared_values

In [None]:
def preprocessingOutlier(col, df, threshold=0.05):
    """
    This function removes outliers from the data. It only removes the outliers (strictly below min_bound or strictly above 
    max_bound, where the bounds are the 1.5 * IQR fences) if the number of outliers constitutes no more than a threshold 
    (default = 0.05 (5%)) of the entire DataFrame (df).
    
    Parameters
    ----------
    col: string-type
    This specifies the col in the DataFrame to check and remove (if applicable) outliers.
    
    df: pandas DataFrame
    This specifies the data. It is never modified; a copy is returned.
    
    threshold: float-like
    This specifies the fraction of rows up to which outliers should be removed. Removing outliers might not always be a 
    good choice as it might remove important information about the dataset.
    
    Returns
    -------
    df_copy: pandas DataFrame
    Copy of df, without the outlier rows when they were few enough to remove. An empty df is returned as an empty copy.
    """
    df_copy = df.copy()
    
    # Nothing to measure on an empty frame (and the percentage below would divide by zero).
    if df_copy.shape[0] == 0:
        return df_copy
    
    # Tukey fences from the quartiles (linear interpolation, missing values ignored)
    column = df_copy[col]
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    iqr = q3 - q1
    min_bound = q1 - 1.5 * iqr
    max_bound = q3 + 1.5 * iqr
    
    # Strict comparisons: a value sitting exactly on a fence is not an outlier. This matters when iqr == 0,
    # where both fences collapse onto the typical value and <= / >= would flag every row.
    is_outlier = (column < min_bound) | (column > max_bound)
    outlier_percentage = is_outlier.sum() / df_copy.shape[0]
    
    if outlier_percentage <= threshold:
        # Remove Outliers with the boolean mask; dropping by index label would also drop
        # non-outlier rows that happen to share a label with an outlier.
        print("%s Outlier Percentage is %.3f, that is less than or equals to the threshold value of %f" % (col, outlier_percentage, threshold))
        df_copy = df_copy.loc[~is_outlier].copy()
    else:
        # Don't Remove Outliers
        print("%s Outlier Percentage is %.3f, that is more than the threshold value of %f" % (col, outlier_percentage, threshold))
    return df_copy

In [None]:
def preprocessingSkew(col, threshold = 1):
    """
    This function decides if transformation functions should be applied to the given column based on the skew of the column and
    the threshold value.
    
    - Left skew (skew <= -threshold): powerTransform.
    - Right skew (skew >= threshold) with strictly positive data: logTransform.
    - Right skew with a minimum of zero: squareRootTransform.
    - Right skew with negative values: the data is first shifted so that its minimum is zero and then 
      squareRootTransform is applied, because the square root of a negative number is NaN.
    
    Parameters
    ----------
    col: pandas Series
    This specifies the data to be tested and transformed if necessary.
    
    threshold: float-like or int-like (positive value only)
    This specifies the threshold for which a given column will be transformed. If the absolute skew of the given column 
    is greater than or equal to the threshold, a transformation function will be applied to the data. 
    
    Returns
    -------
    col: pandas Series
    Transformed col, or the original col object when the skew is within the threshold (or cannot be computed).
    
    
    """
    skew = col.skew()
    
    # Written as two comparisons so that a NaN skew fails both and the data is returned untouched.
    if not (skew >= threshold or skew <= -threshold):
        return col
    
    if skew < 0:
        # Left Skew 
        return powerTransform(col)
    
    # Right Skew
    smallest = np.min(col)
    if smallest > 0:
        return logTransform(col)
    if smallest < 0:
        # Shift to a zero minimum; this keeps the ordering of the values and avoids NaN from sqrt.
        col = col - smallest
    return squareRootTransform(col)

In [None]:
# Build a Pipeline that includes preprocessing and linear regression
def pipeline(X, y, df, testsize=0.25, thresholdOutlier=0.05, thresholdSkew=1):
    """
    This function is a pipeline that trains and tests a linear regression four times to evaluate how preprocessing changes
    the goodness of fit of the model:
    1. on the data as given,
    2. after removing outliers,
    3. after transforming skewed predictors,
    4. after removing outliers and then transforming skewed predictors.
    
    Predictors whose dtype is category are never preprocessed. Original DataFrame is not updated by the preprocessing 
    functions.
    
    Parameters
    ----------
    X: array-like or string-like
    This specifies the predictor variable(s) (input variable(s)).
    
    y: string-like
    This specifies the response variable (output variable).
    
    df: pandas DataFrame
    This specifies the DataFrame.
    
    testsize: float-like
    This specifies the test size for train_test_split.
    
    thresholdOutlier: float-like
    This specifies the threshold for removing outliers.
    
    thresholdSkew: float-like or int-like
    This specifies the threshold for skewing of data.
    
    Returns
    -------
    train_r2_list: list
    Train R2 of the four fits, in the order listed above.
    
    test_r2_list: list
    Test R2 of the four fits, in the order listed above.
    """    
    # Show type of Linear Regression
    univariate = isinstance(X, str)
    if univariate:
        print("Performing Univariate Linear Regression\n")
    else:
        print("Performing Multivariate Linear Regression\n")
    
    # Predictors that may be preprocessed. A single name is wrapped in a list first, so a categorical
    # string X is skipped instead of being iterated character by character.
    categorical_columns = set(df.select_dtypes('category').columns)
    predictors = [X] if univariate else list(X)
    numeric_predictors = [col for col in predictors if col not in categorical_columns]
    
    train_r2_list = []
    test_r2_list = []
    
    def _fitAndReport(data):
        # Fit on `data`, print the report and record both R2 scores.
        results = linreg(X, y, data, testsize)
        visualiseModelV2(X, data, *results)
        train_r2_list.append(results[4])  # train_r2
        test_r2_list.append(results[5])   # test_r2
    
    def _removeOutliers(data):
        # Apply the outlier filter column by column; each call returns a new frame.
        for col in numeric_predictors:
            data = preprocessingOutlier(col, data, thresholdOutlier)
        return data
    
    def _reduceSkew(data):
        # Transform skewed predictors on a copy so the caller's frame stays untouched.
        data = data.copy()
        for col in numeric_predictors:
            data[col] = preprocessingSkew(data[col], thresholdSkew)
        return data
    
    # Get Results for Not Processed Data
    _fitAndReport(df)
    
    # Removing Outliers 
    print("*****REMOVE OUTLIERS*****")
    _fitAndReport(_removeOutliers(df))
    
    # Normalising Data
    print("*****NORMALISING DATA*****")
    _fitAndReport(_reduceSkew(df))
    
    # Removing Outliers + Normalising Data
    print("*****REMOVING OUTLIER + NORMALISING DATA*****")
    _fitAndReport(_reduceSkew(_removeOutliers(df)))
    
    # Visualise Change in R2 across the four stages
    stages = ["Raw", "No outliers", "Normalised", "No outliers + normalised"]
    f, axes = plt.subplots(nrows=1, ncols=2, figsize=(15,5))
    
    panels = ((train_r2_list, "Change in Train R2"), (test_r2_list, "Change in Test R2"))
    for ax, (scores, title) in zip(axes, panels):
        ax.plot(stages, scores, 'o-')
        ax.set_title(title)
        ax.set_ylabel("R2")

    return train_r2_list, test_r2_list

In [None]:
def visualiseModelV2(col, df, coeff, intercept, pred_train, pred_test, train_r2, test_r2, train_mse, test_mse, X_train, X_test, y_train, y_test):
    """
    This function reports the model's training and testing results. It prints out the metrics (R2 and MSE) and the fitted
    coefficients; it does not draw any plot.
    
    Parameters
    ----------
    col: string or list
    This represents the column(s) that was used in training and evaluating the model. A list prints a coefficient table;
    anything else prints the best-fit line equation with col as the predictor name.
    
    df: pandas DataFrame
    This represents the DataFrame. It is not used by this function.
    
    coeff: array-like
    This represents the coefficient(s) of the best fit line (y=aX+c). It is flattened, so both lr.coef_ of shape 
    (1, n_features) and a flat array are accepted.
    
    intercept: float-like or array-like
    This represents the intercept of the best fit line (y=aX+c). Only its first element is used.
    
    pred_train: Array-like
    This represents the model predictions on X_train. It is not used by this function.
    
    pred_test: Array-like
    This represents the model predictions on X_test. It is not used by this function.
    
    train_r2: float-like
    This value is the R2 score on the training data. 
    
    test_r2: float-like
    This value is the R2 score on the testing data. 
    
    train_mse: float-like
    This value represents the mean square error on training data.
    
    test_mse: float-like
    This value represents the mean square error on testing data.
    
    X_train, X_test, y_train, y_test: array-like
    These represent the data used in training and testing the model. They are not used by this function.
    
    Returns
    -------
    None
    """
    print(f"Linear Regression with {col}")
    
    # Print out metrics score
    print("TRAINING")
    print("train_r2:", train_r2)
    print("train_mse:", train_mse)
    print('\nTESTING')
    print("test_r2:", test_r2)
    print("test_mse", test_mse)
    print()
    
    # Work with plain floats: formatting a size-1 ndarray with %f relies on a deprecated implicit conversion.
    coefficients = np.ravel(coeff)
    intercept_value = float(np.ravel(intercept)[0])
    
    # Print equations of Best Fit Lines
    if isinstance(col, list):
        # One row per predictor; the scalar intercept is repeated on every row.
        print(pd.DataFrame({'coef_': coefficients, 'intercept_': intercept_value}, index=col))
    else:
        slope = float(coefficients[0])
        # The predictor in the equation is col itself (it was previously hard-coded to an unrelated name).
        print(f"Best Fit Line Equation for {col}: y = {slope:f} * {col} + {intercept_value:f}")
    print()
    
    return None

In [None]:
def calculateVIF(cols):
    """
    This function calculates the VIF (variance inflation factor) of every column to check for multi-collinearity and 
    returns them in a DataFrame.
    
    Parameters
    ----------
    cols: pandas DataFrame
    This specifies the data whose columns are checked against one another (its .columns, .values and .shape are used).
    
    Returns
    -------
    vif_table: pandas DataFrame
    DataFrame with one row per column of cols: the column name in 'Variables' and its VIF in 'VIF'.
    """
    vif_table = pd.DataFrame()
    vif_table['Variables'] = cols.columns

    vif_scores = []
    for position in range(cols.shape[1]):
        vif_scores.append(variance_inflation_factor(cols.values, position))
    vif_table["VIF"] = vif_scores

    return vif_table

In [None]:
def greedyFeatureSelection(X, y, df):
    """
    This function does greedy (forward) search to select the best feature combination that will yield the best result. 
    "Best Result" is user defined. In this example, we want to maximise R2 score, hence, in every round we add the feature 
    whose model yields the highest score, where the score is max(train R2, test R2) from linreg.
    
    The search stops when
    - every feature has been selected, or no remaining feature gives a score above 0, or
    - more than two rounds have been completed and the latest round scored lower than the previous one; that latest 
      round is then discarded.
    
    Parameters
    ----------
    X: array-like
    This represents the candidate feature (column) names.
    
    y: string-like
    This represents the response variable.
    
    df: pandas DataFrame
    This represents the dataframe.
    
    Returns
    -------
    best_scores: list
    This contains the best score (R2) of every kept round.
    
    best_features: list
    This contains the features selected in every kept round, in selection order.
    
    Notes
    -----
    linreg draws a new random train/test split on every call, so the scores (and the selection) can vary between runs.
    """
    best_scores, best_features = [], []
    
    while True:
        # None (not an empty list) marks "no candidate found this round".
        this_feature = None
        best_score = 0
        
        for feature in X:
            if feature in best_features:
                continue
            
            selected_features = best_features + [feature]
            
            results = linreg(selected_features, y, df)
            train_r2, test_r2 = results[4], results[5]
            score = max(train_r2, test_r2)
            
            if score > best_score:
                best_score = score
                this_feature = feature
        
        if this_feature is None:
            # Nothing left to add: stop here instead of appending a placeholder and looping forever.
            return best_scores, best_features
        
        best_features.append(this_feature)
        best_scores.append(best_score)
        
        if len(best_scores) > 2 and best_scores[-1] < best_scores[-2]:
            # The latest feature made the model worse; drop it from the result.
            return best_scores[:-1], best_features[:-1]

### Load Dataset

In [None]:
df = pd.read_csv('../input/vehicle-dataset-from-cardekho/car data.csv')

### Basic EDA and Data Cleaning

In [None]:
df.head()

In [None]:
# Rename all columns positionally to snake_case names.
# NOTE(review): this assumes the CSV has exactly these 9 columns in this order - confirm against df.head() above.
df.columns = ['name', 'year', 'selling_price', 'present_price', 'km_driven', 'fuel', 'seller_type',
       'transmission', 'owner']

In [None]:
df.tail()

In [None]:
rows, cols = df.shape
print("Number of Rows:", rows)
print("Number of Columns:", cols)

In [None]:
df_columns = df.columns
print("Columns:", df_columns)

In [None]:
df.info()

There are no missing data.
<br>
There are 4 ```object``` columns and 5 ```int64```/```float64``` columns.

In [None]:
object_df = df.select_dtypes('object')
numeric_df = df.select_dtypes(['int64', 'float64'])

In [None]:
object_df.head()

In [None]:
numeric_df.head()

#### Hypothesis
1. Cars with **higher** ```km_driven``` will have a lower ```selling_price```, because they have experienced more wear and tear.
2. ```selling_price``` of cars with manual ```transmission``` is **higher** than that of auto.
3. ```selling_price``` of cars that run on diesel ```fuel``` is **lower** than that of cars that run on other fuel types.
4. ```selling_price``` of cars with more ```owner```(s) is **lower**.
5. Earlier ```year``` of cars will have **higher** ```km_driven```.

### Exploratory Data Analysis
In this section, I aim to explore the data, extract insights and provide answers for the hypotheses above. I will perform Univariate Analysis, Bivariate Analysis and Multivariate Analysis. 
<br>
In Univariate Analysis, we are interested in knowing the **Central Tendency** and the **Spread of Data**. In Bivariate Analysis, we are interested in finding out if the variables are **Mutually Dependent** and **Correlated**. In Multivariate Analysis, we are interested in observing how different (combinations of) features interact with one another.

### Univariate Analysis and Visualisation

In [None]:
object_df.head()

In [None]:
for col in object_df.columns[1:]:
    uniqueness(col, object_df)
    print()

In [None]:
# Visualisation: one count plot per categorical column (every object column except the first one, name)
f, axes = plt.subplots(nrows=len(object_df.columns[1:]), ncols=1, figsize=(7,12))

for ax, col in zip(axes, object_df.columns[1:]):
    g = sns.countplot(x=col, data=object_df, ax=ax)
    # Give every subplot a title so the figure can be read on its own.
    g.set_title(f"Count of {col}")
plt.tight_layout()

#### Analyse and Visualise ```name``` 
- Preprocess text data
    - Lowercase 
    - Remove words with numbers
    - Remove white spaces
    - tokenise and create sparse matrix
    - Remove stop words (if any)
- Create wordcloud 
- Create freqdist and plot

In [None]:
# Unique Values Set A: names in the first 4340 rows
# NOTE(review): 4340 is a hard-coded row position - confirm it is a meaningful boundary for this dataset.
uniqueness('name', object_df[:4340])

In [None]:
# Unique Values Set B: names in the rows from position 4340 onwards
# NOTE(review): this slice is empty if the dataframe has fewer than 4340 rows - confirm against the row count.
uniqueness('name', object_df[4340:])

In [None]:
def preprocessLowerCase(string):
    """Return the given string converted to lower case."""
    lowered = string.lower()
    return lowered

def preprocessRemoveStopWords(string, stopwords):
    """
    Remove stop words from the text and return the remaining words joined by single spaces.

    Parameters
    ----------
    string: string or list of strings
    The text to clean. A string is tokenised by splitting on single spaces; a list is used as the tokens.

    stopwords: iterable of strings
    The words to remove. Matching is exact (case-sensitive).

    Returns
    -------
    string
    The tokens that are not stop words, joined by a single space.
    """
    if isinstance(string, str):
        # Tokenise
        string = string.split(' ')

    # Filter into a new sequence: assigning to the loop variable would leave the token list unchanged.
    blocked = set(stopwords)
    return ' '.join(word for word in string if word not in blocked)

def preprocessRemoveWhiteSpace(string):
    """
    Collapse repeated spaces and trim leading/trailing spaces.

    A string is split on single spaces (a list is used as the tokens); empty tokens are discarded and the rest are
    joined by a single space.
    """
    # Tokenise
    tokens = string.split(' ') if isinstance(string, str) else string

    return ' '.join(filter(lambda token: token != '', tokens))

def preprocessRemoveNumbersWord(string):
    """Delete every word that contains at least one digit, then strip leading/trailing whitespace."""
    word_with_digit = re.compile(r'\w*\d\w*')
    without_numbers = word_with_digit.sub('', string)
    return without_numbers.strip()

def preprocessPunctuation(string):
    """Return the text with every character found in string.punctuation removed."""
    kept = []
    for character in string:
        if character not in punctuation:
            kept.append(character)
    return ''.join(kept)

def processText(string):
    """
    Run the full text-cleaning chain on one string and return the cleaned string.

    Steps, in order: load the nltk English stop words, lower-case, collapse spaces, remove words containing digits,
    remove punctuation, collapse spaces again, then pass the result and the stop words to preprocessRemoveStopWords.
    """
    nltk_stopwords = stopwords.words('english')

    cleaning_steps = (
        preprocessLowerCase,
        preprocessRemoveWhiteSpace,
        preprocessRemoveNumbersWord,
        preprocessPunctuation,
        preprocessRemoveWhiteSpace,  # removing words/punctuation can leave new gaps
    )
    for step in cleaning_steps:
        string = step(string)

    return preprocessRemoveStopWords(string, nltk_stopwords)
name = df['name']

In [None]:
# CREATE STOPWORDS LIBRARY 
nltk_stopwords = stopwords.words('english')

In [None]:
name = name.apply(processText)

In [None]:
name

In [None]:
corpus = ' '.join(list(name))

In [None]:
wc = WordCloud(min_font_size=5, background_color= 'white').generate(corpus)
plt.figure(figsize=(20,20))
plt.imshow(wc, interpolation= 'bilinear')
plt.axis('off')

In [None]:
freqDist = FreqDist(corpus.split(' '))
freqDist.most_common()

Further processing
- Remove words with fewer than 4 letters

In [None]:
def removeShortWords(string, min_length=4):
    """
    This function removes words that are shorter than a minimum length.

    Parameters
    ----------
    string: string-like or list of strings
    This specifies the text to be cleaned. A string is tokenised on single spaces,
    anything else is treated as an already tokenised sequence.

    min_length: int, default 4
    This specifies the smallest word length that is kept. With the default,
    words of 3 letters or fewer are removed.

    Returns
    -------
    string: string-like
    The remaining words joined by single spaces.
    """
    if isinstance(string, str):
        string = string.split(" ")

    return ' '.join([word for word in string if len(word) >= min_length])

name = name.apply(removeShortWords)
corpus = ' '.join(list(name))
wc = WordCloud(min_font_size=5, background_color= 'white').generate(corpus)
plt.figure(figsize=(20,20))
plt.imshow(wc, interpolation= 'bilinear')
plt.axis('off')

In [None]:
freqDist = FreqDist(corpus.split(' '))
freqDist.most_common(50)

In [None]:
numeric_df.head()

In [None]:
numeric_df[['selling_price', 'km_driven']].describe()

In [None]:
df[df['selling_price'] == 0.1]

Seems like the dataset is not limited to 4 wheeled vehicles! 
<img src='bajaj-pulsar-150-black-red.png'>
<center>Picture of a Bajaj Pulsar 150</center>

In [None]:
f, axes = plt.subplots(nrows=2, ncols=2, figsize=(15,7))
cols = numeric_df[['selling_price', 'km_driven']].columns

for i in range(2):
    g1 = sns.boxplot(x=cols[i], data= numeric_df, ax=axes[i, 0])
    g1.set_title(f"Boxplot of {cols[i]}")
    
    g2 = sns.histplot(x=cols[i], data= numeric_df, ax=axes[i, 1], kde=True)
    g2.set_title(f"Histplot and KDEplot of {cols[i]}")
    
plt.tight_layout()

In [None]:
skewKurtosis(cols, numeric_df)

#### Analyse and Visualise ```year```
- Earliest Year 
- Latest Year
- Spread of Years

In [None]:
year = numeric_df['year']

In [None]:
min_year = np.min(year)
max_year = np.max(year)
print("Min Year:", min_year)
print("Max Year:", max_year)
print("Range:", max_year - min_year, 'years')

In [None]:
f, axes = plt.subplots(nrows=3, ncols=1, figsize=(15,10))

# The series is passed by keyword (x=) rather than positionally: seaborn has changed
# what the first positional argument means between releases, and x= keeps every plot
# drawn along the horizontal axis.
g1 = sns.histplot(x=year, ax=axes[0])
g1.set_title('Distributions of year')

g2 = sns.kdeplot(x=year, ax=axes[1], color='r')
g2.set_title('KDEplot of year')

g3 = sns.boxenplot(x=year, ax=axes[2], color='g')
g3.set_title("Boxenplot of year")

plt.tight_layout()

In [None]:
# Central Tendencies and Spread of Year
year.describe()

We can expect the skew to be negative and kurtosis to be platykurtic! 

In [None]:
skewKurtosis('year', numeric_df)

Oh wait... never mind, it is leptokurtic!

### Bivariate Analysis and Visualisation
In this section, I aim to provide answers for the hypotheses that I drafted above!
>```selling_price``` of vehicles with **higher** ```km_driven``` will sell for a lower price. This is because the vehicles experience wear and tear.
<br>
>```selling_price``` of vehicles with manual ```transmission``` is **higher** than that of auto.
<br>
>```selling_price``` of vehicles that run on diesel ```fuel``` is **lower** than that of petrol.
<br>
>```selling_price``` of vehicles with more ```owner```(s) is **lower**.
<br>
>Earlier ```year``` of vehicles will have **higher** ```km_driven```.
<br>

Note: I changed "cars" to "vehicles" because I found out that the dataset is not limited to 4-wheeled vehicles (cars).

#### Hypothesis 1
```selling_price``` of vehicles with **higher** ```km_driven``` will sell for a lower price. This is because the vehicles experience wear and tear.

In [None]:
sns.jointplot(x='selling_price', y='km_driven', data=df, 
              height=7, space=0, alpha=0.7, kind='hex', cmap='BuGn')

In [None]:
corr = df['selling_price'].corr(df['km_driven'])
print("Correlation between selling_price and km_driven:", corr)

PMCC is relatively close to 0 and its sign is negative. This indicates a weak (almost negligible) negative correlation between ```selling_price``` and ```km_driven```. Therefore, **higher** ```km_driven``` does not translate to a **lower** ```selling_price```.

#### Hypothesis 2
```selling_price``` of vehicles with manual ```transmission``` is **higher** than that of auto.

In [None]:
plt.figure(figsize=(10,5))
g = sns.countplot(x='transmission', data=df)
g.set_title("Countplot of Transmission")

There are more Manual cars as compared to Automatic cars.

In [None]:
f, axes = plt.subplots(nrows=3, ncols=1, figsize=(10,15))

# KDEplot
g1 = sns.kdeplot(x='selling_price', data=df, hue='transmission', ax=axes[0])
g1.set_title('KDEplot of selling_price and transmission')

# Boxplot
g2 = sns.boxplot(x='selling_price', y='transmission', data=df, ax=axes[1])
g2.set_title('Boxplot of selling_price and transmission')

# Histplot
g3 = sns.histplot(x='selling_price', data=df, hue='transmission', ax=axes[2])
g3.set_title('Histplot of selling_price and transmission')

From the plots, we can observe that the ```selling_price``` for Automatic ```transmission``` is generally higher than that of Manual ```transmission```.

In [None]:
auto = df[df['transmission'] == 'Automatic']
manual = df[df['transmission'] == 'Manual']

def _summaryStatistics(values):
    """Return [min, max, range, mean, median] of the given values, in that order."""
    lowest = np.min(values)
    highest = np.max(values)
    return [lowest, highest, highest - lowest, np.mean(values), np.median(values)]


def comparison(auto, manual, comparison):
    """
    This function prints, for each summary statistic, which transmission group has the higher value.

    Parameters
    ----------
    auto: pandas DataFrame
    This specifies the rows with Automatic transmission.

    manual: pandas DataFrame
    This specifies the rows with Manual transmission.

    comparison: string-like
    This specifies the numeric column to be compared (e.g. 'selling_price').

    Returns
    -------
    None

    Notes
    -----
    The statistics compared are Min, Max, Range, Mean and Median. Equal values are reported
    as a tie, and values that cannot be ordered (e.g. NaN) are reported as not comparable
    instead of being attributed to the Automatic group.
    """
    manual_list = _summaryStatistics(manual[comparison])
    auto_list = _summaryStatistics(auto[comparison])
    attributes = ['Min', 'Max', 'Range', 'Mean', 'Median']

    for attribute, manual_value, auto_value in zip(attributes, manual_list, auto_list):
        if manual_value > auto_value:
            print(f'Manual Transmission has a higher {attribute} {comparison}:', manual_value)
        elif auto_value > manual_value:
            print(f'Auto Transmission has a higher {attribute} {comparison}:', auto_value)
        elif auto_value == manual_value:
            print(f'Manual and Auto Transmission have the same {attribute} {comparison}:', auto_value)
        else:
            # Neither >, < nor == holds, which happens when a statistic is NaN
            print(f'{attribute} {comparison} could not be compared (Manual, Auto):', manual_value, auto_value)

    return None
comparison(auto, manual, 'selling_price')

We can conclude that the ```selling_price``` of Automatic ```transmission``` is generally **higher** than that of Manual ```transmission```.

#### Hypothesis 3
```selling_price``` of vehicles that run on diesel ```fuel``` is **lower** than that of petrol.

In [None]:
plt.figure(figsize=(10,5))
g = sns.countplot(x='fuel', data=df)
# The title names the column that is actually plotted
g.set_title("Countplot of Fuel")

There are more Petrol ```fuel``` vehicles as compared to that of Diesel ```fuel```.

In [None]:
f, axes = plt.subplots(nrows=3, ncols=1, figsize=(10,15))

# KDEplot
g1 = sns.kdeplot(x='selling_price', data=df, hue='fuel', ax=axes[0])
g1.set_title('KDEplot of selling_price and fuel')

# Boxplot
g2 = sns.boxplot(x='selling_price', y='fuel', data=df, ax=axes[1])
g2.set_title('Boxplot of selling_price and fuel')

# Histplot
g3 = sns.histplot(x='selling_price', data=df, hue='fuel', ax=axes[2])
g3.set_title('Histplot of selling_price and fuel')

From the plots, we can conclude that the ```selling_price``` of Diesel ```fuel``` vehicles is generally **higher** than that of Petrol ```fuel```! 
<br>
While there are other ```fuel``` type vehicles, we are not interested in them as they are not as significant as Petrol and Diesel ```fuel``` type vehicles.

#### Hypothesis 4
```selling_price``` of vehicles with more ```owner```(s) is **lower**.

In [None]:
f, axes = plt.subplots(nrows=3, ncols=1, figsize=(10,15))

# KDEplot
g1 = sns.kdeplot(x='selling_price', data=df, hue='owner', ax=axes[0])
g1.set_title('KDEplot of selling_price and owner')

# Boxplot
g2 = sns.boxplot(y='selling_price', x='owner', data=df, ax=axes[1])
g2.set_title('Boxplot of selling_price and owner')

# Histplot
g3 = sns.histplot(x='selling_price', data=df, hue='owner', ax=axes[2])
g3.set_title('Histplot of selling_price and owner')

We can observe a general downward ```selling_price``` trend as the number of owners increases!

In [None]:
# year and selling_price
plt.figure(figsize=(15, 10))
g = sns.swarmplot(x='year', y='selling_price', data=df)

We can observe that there is an increase in ```selling_price``` over the years! 

In [None]:
corr = df['selling_price'].corr(df['year'])

In [None]:
corr

#### Hypothesis 5
Earlier ```year``` of vehicles will have **higher**  ```km_driven```.

In [None]:
plt.figure(figsize=(15,15))
sns.boxplot(y='km_driven', x='year', data=df)

In [None]:
corr = df['year'].corr(df['km_driven'])
corr

PMCC indicates a moderate negative correlation between ```year``` and ```km_driven```. As ```year``` increases, ```km_driven``` decreases! Hence, we can support our hypothesis!

In [None]:
# Correlation Heatmap
# Only numeric columns can be correlated; selecting them explicitly avoids the error
# that recent pandas versions raise when corr() meets object (text) columns.
corr = df.select_dtypes(include='number').corr()

plt.figure(figsize=(10,10))
g = sns.heatmap(corr, annot=True, vmin=-1, vmax=1, fmt='0.2f')

In [None]:
# Encode Categorical Features
df_copy = df.copy()
for col in ['fuel', 'seller_type', 'transmission', 'owner']:
    df_copy = encode(col, df_copy)

In [None]:
# Correlation Heatmap (after encoding the categorical features)
# Columns that are still non-numeric are excluded explicitly, because recent pandas
# versions raise when corr() meets object (text) columns.
corr = df_copy.select_dtypes(include='number').corr()

plt.figure(figsize=(10,10))
g = sns.heatmap(corr, annot=True, vmin=-1, vmax=1, fmt='0.2f')

```selling_price``` seems to have pretty low correlation with all the other features....

In [None]:
df_copy.columns

In [None]:
for col in ['year', 'km_driven', 'fuel', 'seller_type', 'transmission', 'owner']:
    correlation(col, 'selling_price', df_copy)

```selling_price``` seems to have pretty low correlation with all the other features....

In [None]:
plt.figure(figsize=(10,7))
sns.boxplot(y='selling_price', x='seller_type', data=df)

Welp. ```selling_price``` of Trustmark Dealer is **higher** than Dealer which is in turn **higher** than that of Individual! Well.... a commission-based salary scheme could be the cause of this! Dealers will generally mark up the price to earn more commission for themselves!
<br><br>
Note: Buy from individual sellers!

In [None]:
plt.figure(figsize=(10,7))
sns.boxplot(y='km_driven', x='fuel', data=df)

We can observe that vehicles that run on Diesel ```fuel``` generally have higher ```km_driven```! Diesel ```fuel``` is cheaper as compared to Petrol. This might explain the trend!

### TODO (DONE BELOW)
Can we improve the correlation between ```selling_price``` and other features in the DataFrame by doing **Feature Engineering** and/or **Feature Preprocessing**?
<br><br>
**Feature Engineering**
<br>
Use other existing features to create new features that may provide us with more valuable information.
<br><br>
**Feature Preprocessing** (Done in pipeline())
<br>
1. Remove Outliers
2. Normal (Gaussian) Transforms
3. Scaling to StandardScaler

### Multivariate Analysis and Visualisation
Some questions that I want to answer in multivariate analysis!

1. Is there a relationship between ```selling_price```, ```km_driven``` and ```fuel```?
2. Is there a relationship between ```selling_price```, ```owner``` and ```km_driven```?
3. Is there a relationship between ```km_driven```, ```fuel``` and ```transmission```?

In [None]:
# Explore Relationship between km_driven, fuel and transmission
f, axes = plt.subplots(nrows=1, ncols=2, figsize=(15,5))

g1 = sns.boxplot(x='km_driven', y='fuel', hue='transmission', data=df, ax=axes[0])
g1.set_title("Boxplot between km_driven, fuel and transmission")
g2 = sns.boxplot(x='km_driven', y='transmission', hue='fuel', data=df, ax=axes[1])
g2.set_title("Boxplot between km_driven, fuel and transmission")

plt.tight_layout()

From the plots, we can make some observations:
1. There are no manual electric cars.
2. ```km_driven``` on Automatic ```transmission``` is generally less as compared to that of Manual ```transmission```.

In [None]:
df[df['fuel'] == 'Electric']

In [None]:
# Explore Relationship between selling_price, km_driven and fuel
plt.figure(figsize=(15, 7))

g = sns.scatterplot(x='selling_price', y='km_driven', hue='fuel', data=df, size='fuel', alpha=0.7)
g.set_title("Group Scatterplot between selling_price, km_driven and fuel")

There isn't an apparent relationship between the 3 variables...

In [None]:
# Explore Relationship between selling_price, km_driven and owner
plt.figure(figsize=(15, 7))

g = sns.scatterplot(x='selling_price', y='km_driven', hue='owner', data=df, size='owner', alpha=0.7)
g.set_title("Group Scatterplot between selling_price, km_driven and owner")

There isn't an apparent relationship between the 3 variables...

In [None]:
df.columns

```python
sns.relplot(
    data=tips, x="total_bill", y="tip",
    col="time", hue="day", style="day",
    kind="scatter"
)
```

In [None]:
# Explore Relationship between selling_price, km_driven, transmission and fuel
sns.relplot(x='selling_price', y='km_driven', col='transmission', hue='fuel',
            kind='scatter', data=df, alpha=0.5, height=10, size='fuel')

From the plots, we can observe:
1. Diesel ```fuel``` vehicles generally have higher ```selling_price``` and ```km_driven```. This trend is observed in both Automatic and Manual ```transmission```.
2. There are significantly fewer data points in Automatic ```transmission``` as compared to that of Manual ```transmission```.

In [None]:
# Explore Relationship between selling_price, km_driven, transmission and owner
sns.relplot(x='selling_price', y='km_driven', col='transmission', hue='owner',
            kind='scatter', data=df, alpha=0.5, height=10, size='owner')

From the plots, we can observe:
1. First Owner ```owner``` have generally lower ```selling_price``` and ```km_driven```. This is observed in both ```transmission``` types.
2. As expected, ```km_driven``` for Manual ```transmission``` is generally higher as compared to that of Automatic ```transmission```.

### Feature Engineering

In [None]:
df.columns

In [None]:
# Age of Vehicle (as of 2021)
age = 2021 - df['year']

In [None]:
f, axes = plt.subplots(nrows=1, ncols=3, figsize=(15,5))

# The series is passed by keyword (x=) rather than positionally: seaborn has changed
# what the first positional argument means between releases, and x= keeps every plot
# drawn along the horizontal axis.
g1 = sns.histplot(x=age, ax=axes[0])
g1.set_title("Histplot of Age")

g2 = sns.kdeplot(x=age, ax=axes[1], color='r')
g2.set_title("KDEplot of Age")

g3 = sns.boxplot(x=age, ax=axes[2], color='g')
g3.set_title("Boxplot of Age")

plt.tight_layout()

In [None]:
df['age'] = age

In [None]:
df_copy = df.copy()

### Feature Preprocessing

In [None]:
# Encode Categorical Columns
for col in ['fuel', 'seller_type', 'transmission', 'owner']:
    df_copy = encode(col, df_copy)

In [None]:
X = df_copy.drop(columns=['selling_price', 'name'])
y = df_copy[['selling_price']]

In [None]:
X.head()

In [None]:
y.head()

### Greedy Feature Selection and Some Fun :D
This section is merely for fun, please read on!

In [None]:
X = list(X.columns)
y = 'selling_price'

In [None]:
bestest_of_the_best_features = []

for i in range(25):
    best_scores, best_features=greedyFeatureSelection(X, y, df_copy)
    bestest_of_the_best_features.append(best_features)
    print(best_scores)
    print(best_features)
    print()

In [None]:
bestest_of_the_best_features = [i for j in bestest_of_the_best_features for i in j]
Counter(bestest_of_the_best_features).most_common()

The highest counts would be the best predictor among the other features!

In [None]:
pipeline(X, 'selling_price', df_copy)

### Linear Regression

In [None]:
# Univariate Regression 
for col in X:
    coeff, intercept, pred_train, pred_test, train_r2, test_r2, train_mse, test_mse, X_train, X_test, y_train, y_test = linreg(col, 'selling_price', df_copy)
    visualiseModel(col, 'selling_price', df_copy, coeff, intercept, pred_train, pred_test, train_r2, test_r2, train_mse, test_mse, X_train, X_test, y_train, y_test)

In [None]:
# Multivariate Regression: the whole feature list X is passed to linreg in a single call
coeff, intercept, pred_train, pred_test, train_r2, test_r2, train_mse, test_mse, X_train, X_test, y_train, y_test = linreg(X, 'selling_price', df_copy)
visualiseModel(X, 'selling_price', df_copy, coeff, intercept, pred_train, pred_test, train_r2, test_r2, train_mse, test_mse, X_train, X_test, y_train, y_test)

### Conclusion

In this notebook, I exemplified the use of Linear Regression (though it might not be the best model) to predict ```selling_price``` of the vehicle! I aim to maximise R2 Score (Explained Variance). 

- If you have any questions, please post it in the comments!
- Please suggest some improvements for me!
- Please upvote if you found it useful!

Have a nice day :D