### Performing Multiple Linear Regression on the following data

### Importing libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Importing data

In [None]:
df = pd.read_csv("../input/diamonds/diamonds.csv")

In [None]:
df.head()

In [None]:
# Removing Unnamed column which is not use

In [None]:
df = df.drop("Unnamed: 0", axis = 1)

### Checking missing values

In [None]:
df.isna().sum()
#There is no null values here 

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
# There are three categorical variables (cut, color, clarity)
# To perform MLR we need to transform the categorical data into numeric values

In [None]:
df['color'].value_counts().plot(kind= 'bar')

In [None]:
df['cut'].value_counts().plot(kind = 'bar')

In [None]:
df['clarity'].value_counts()

In [None]:
dups = df.duplicated()
dups.sum()

In [None]:
df.hist(figsize = (20,20), bins = 150)

# Need to transform categorical variables into numeric values

# Using label encoding instead of one-hot encoding
# because the categorical data is ordinal

In [None]:
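#### For Cut - the grades are Fair, Good, Very Good, Premium, Ideal (worst to best)
#### For Color - D is best and J is worst
#### For Clarity - grades describe how flawless the stone is: from I3 (included, level 3) up through SI, VS, VVS and IF to FL (flawless)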

In [None]:
df['cut'] = pd.Categorical(df['cut']).codes

In [None]:
df['color'] = pd.Categorical(df['color']).codes

In [None]:
df['clarity'] = pd.Categorical(df['clarity']).codes

In [None]:
df.head(2)

In [None]:
plt.figure(figsize=(10,10))
sns.heatmap(df.corr(), annot=True)

In [None]:
sns.pairplot(df)

In [None]:
plt.figure(figsize =(15,10))
df.boxplot(vert = 0)

In [None]:
# There are many outliers
# The outlier treatment is done below

In [None]:
# Outliers Treatment

In [None]:
def remove_outlier(col, whisker=1.5):
    """Return the IQR-based outlier bounds for a column of numbers.

    Despite its name, this function removes nothing: it only computes the
    lower and upper bounds outside of which a value is treated as an
    outlier. The caller decides whether to drop or cap such values.

    Parameters
    ----------
    col : array-like of numbers
        The values to analyse (for example a DataFrame column).
    whisker : float, default 1.5
        Multiple of the inter-quartile range added beyond each quartile.

    Returns
    -------
    tuple
        ``(lower_range, upper_range)``, i.e.
        ``(Q1 - whisker * IQR, Q3 + whisker * IQR)``.
    """
    # np.percentile does not need pre-sorted input; the earlier bare
    # sorted(col) call threw its result away and has been removed.
    q1, q3 = np.percentile(col, [25, 75])
    iqr = q3 - q1
    lower_range = q1 - (whisker * iqr)
    upper_range = q3 + (whisker * iqr)
    return lower_range, upper_range

In [None]:
for column in df.columns:
    lr,ur=remove_outlier(df[column])
    df[column]=np.where(df[column]>ur,ur,df[column])
    df[column]=np.where(df[column]<lr,lr,df[column])

In [None]:
#### outliers are pulled towards the upper whisker and lower whisker
plt.figure(figsize =(15,10))
df.boxplot(vert =0)

### Linear Regression

In [None]:
x = df.drop("price", axis = 1)

In [None]:
y = df.pop("price")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
lrmodel = LinearRegression()

In [None]:
x_train, x_test , y_train, y_test = train_test_split(x,y,test_size = 0.2, random_state = 1)

In [None]:
lrmodel.fit(x_train,y_train)

In [None]:
lrmodel.intercept_

In [None]:
lrmodel.coef_

In [None]:
for idx , col_name in enumerate (x_train.columns):
    print("The coefficent for {} is {}". format(col_name, lrmodel.coef_[idx]))

In [None]:
lrmodel.score(x_train,y_train)

In [None]:
lrmodel.score(x_test,y_test)

### Checking R squared value

In [None]:
import statsmodels.formula.api as smf

In [None]:
df_train = pd.concat([x_train, y_train], axis = 1)

In [None]:
df_train.head()

In [None]:
lm1 = smf.ols(formula = "price ~ carat+cut+color+clarity+depth+table+x+y+z", data = df_train).fit()

In [None]:
lm1.params

In [None]:
print(lm1.summary())