In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Reading and Understanding Dataset

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the cleaned laptop dataset and preview its first rows.
csv_path = "/kaggle/input/laptop-price-prediction-cleaned-dataset/laptop_data_cleaned.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Shape of the dataset as a (rows, columns) tuple
df.shape

In [None]:
# Summary of the frame: column names, non-null counts and dtypes
df.info()

In [None]:
# Split the column names by dtype: object columns feed the categorical
# pipeline, numeric columns feed the numerical one.
object_frame = df.select_dtypes(include='object')
numeric_frame = df.select_dtypes(include='number')
cat_cols = list(object_frame.columns)
num_cols = list(numeric_frame.columns)
cat_cols

In [None]:
# Display the numeric column names (the 'Price' target is removed from this list in the next cell)
num_cols

In [None]:
# Exclude the target from the numeric feature list. Filtering (instead of
# list.remove) keeps this cell idempotent: re-running it no longer raises
# ValueError once 'Price' has already been dropped.
num_cols = [name for name in num_cols if name != 'Price']

In [None]:
# Count plot for every categorical column.
for col in cat_cols:
    plt.figure(figsize=(10,5))
    # Plot the column being iterated (the original always plotted 'Company',
    # so each figure showed the same bars under a different title). `data` is
    # passed by keyword, matching the other plotting cells.
    sns.countplot(data=df, x=col)
    plt.title("{} Distribution".format(col))
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# Kernel density estimate for every numeric feature.
for col in num_cols:
    plt.figure(figsize=(8,3))
    # Pass the frame by keyword, like the boxplot/scatterplot cells: a
    # positional frame collides with `x=` on seaborn versions whose first
    # positional parameter is `x`.
    sns.kdeplot(data=df, x=col)
    plt.title("{} Distribution".format(col))
    plt.show()

In [None]:
# Box plot of Price for each level of every categorical column.
for cat_name in cat_cols:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.boxplot(data=df, x=cat_name, y="Price", ax=ax)
    ax.set_title("Price Distribution vs {}".format(cat_name))
    # Category labels can be long, so draw them vertically.
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# Scatter plot of Price against every numeric feature.
for num_name in num_cols:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.scatterplot(data=df, x=num_name, y="Price", ax=ax)
    ax.set_title('Price vs {}'.format(num_name))
    plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,StandardScaler,OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [None]:
# Separate the regression target from the predictor columns.
target_col = 'Price'
y = df[target_col]
X = df.drop(columns=[target_col])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Report the size of each resulting piece.
for label, part in (
    ('X_train shape :', X_train),
    ('y_train shape :', y_train),
    ('X_test shape :', X_test),
    ('y_test shape :', y_test),
):
    print(label, part.shape)

In [None]:
# Categorical pipeline: fill missing values with the most frequent category,
# one-hot encode (categories unseen during fit are ignored at transform time),
# then scale the encoded columns without centering them.
cat_steps = [
    ('Imputer', SimpleImputer(strategy='most_frequent')),
    ('OHE', OneHotEncoder(handle_unknown='ignore')),
    ('Scaler', StandardScaler(with_mean=False)),
]
cat_pipe = Pipeline(steps=cat_steps)

In [None]:
# Numerical pipeline: fill missing values with the column median, then standardise.
num_steps = [
    ('Imputer', SimpleImputer(strategy='median')),
    ('Scaler', StandardScaler()),
]
num_pipe = Pipeline(steps=num_steps)

In [None]:
# Route each group of columns through its own pipeline.
column_routes = [
    ('Categorical_Pipeline', cat_pipe, cat_cols),
    ('Numerical_Pipeline', num_pipe, num_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [None]:
# Instantiate the linear regression model
lr = LinearRegression()

In [None]:
# Final pipeline: preprocessing followed by the regressor.
final_steps = [
    ('Preprocessor', preprocessor),
    ('Regressor', lr),
]
pipeline = Pipeline(steps=final_steps)

In [None]:
# 10-fold shuffled cross-validation on the training split, scored with R2.
cv = KFold(n_splits=10, shuffle=True, random_state=1)
scores = cross_val_score(
    pipeline,
    X_train,
    y_train,
    scoring='r2',
    cv=cv,
    n_jobs=-1,
)
# Report the R2 averaged over the folds.
print('Mean R2 Score:', round(scores.mean(), 2))

In [None]:
# Train the preprocessing + regression pipeline on the training split
pipeline.fit(X_train, y_train)

In [None]:
# Predictions of the fitted pipeline on the training split
y_train_pred = pipeline.predict(X_train)

In [None]:
# R2 of the fitted pipeline on the training split
r2_score(y_true=y_train, y_pred=y_train_pred)

In [None]:
# R2 of the fitted pipeline on the held-out test split
y_test_pred = pipeline.predict(X_test)
r2_score(y_true=y_test, y_pred=y_test_pred)

In [None]:
# Helper that fits one estimator behind the shared preprocessor and reports its R2 scores
def train_model(model):
    """Fit ``model`` inside a preprocessing pipeline and print train/test R2.

    Relies on the notebook-level ``preprocessor``, ``X_train``, ``y_train``,
    ``X_test`` and ``y_test``.

    Parameters
    ----------
    model : estimator
        Regressor used as the final ``'Regressor'`` step of the pipeline.

    Returns
    -------
    None
        The model's class name and both scores are printed.
    """
    print(model.__class__.__name__)

    fitted = Pipeline(steps=[
        ('Preprocessor', preprocessor),
        ('Regressor', model),
    ])
    fitted.fit(X_train, y_train)

    # Score on the data the pipeline was fitted on, then on the held-out split.
    train_r2 = r2_score(y_train, fitted.predict(X_train))
    test_r2 = r2_score(y_test, fitted.predict(X_test))

    print("Train R2 Score : ", train_r2)
    print("Test R2 Score : ", test_r2)

In [None]:
# Linear Regression model: fit it and print its train/test R2 via train_model
train_model(lr)

In [None]:
# Train a decision tree regressor. The fixed seed (the same value used for the
# train/test split and KFold) makes the choice between equally good splits
# repeatable, so the printed scores do not vary between runs.
dt = DecisionTreeRegressor(max_depth=4, random_state=1)
train_model(dt)

In [None]:
# Random forest of 200 shallow trees. Seeded so the random sampling done while
# building the forest is the same on every run, which keeps the scores
# reproducible and consistent with the seeded split and KFold.
rf = RandomForestRegressor(n_estimators=200, max_depth=3, random_state=1)
train_model(rf)

In [None]:
def train_cross_val(model,n):
    """Cross-validate ``model`` behind the shared preprocessor and print mean R2.

    Relies on the notebook-level ``preprocessor``, ``X_train`` and ``y_train``.

    Parameters
    ----------
    model : estimator
        Regressor used as the final ``'Regressor'`` step of the pipeline.
    n : int
        Number of folds passed to ``KFold``.

    Returns
    -------
    None
        The model's class name and the mean fold score are printed.
    """
    print(model.__class__.__name__)

    candidate = Pipeline(steps=[
        ('Preprocessor', preprocessor),
        ('Regressor', model),
    ])

    # Shuffled folds with a fixed seed, scored with R2 on the training split.
    folds = KFold(n_splits=n, shuffle=True, random_state=1)
    fold_scores = cross_val_score(
        candidate,
        X_train,
        y_train,
        scoring='r2',
        cv=folds,
        n_jobs=-1,
    )

    # Report the score averaged over the folds, rounded to two decimals.
    mean_r2 = round(np.mean(fold_scores), 2)
    print('Train Mean R2 Score:', mean_r2)

In [None]:
# Cross-validate each candidate model with the same 10-fold setup.
for candidate in (lr, dt, rf):
    train_cross_val(candidate, 10)