## Problem Statement

## Import All Libraries Required For This Project

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import json
import os
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.metrics import r2_score,mean_absolute_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler,StandardScaler


from sklearn.linear_model import LinearRegression,SGDRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

from xgboost import XGBRegressor
# from lightgbm import LGBMRegressor
# from catboost import CatBoostRegressor

import scipy.stats as sm

ModuleNotFoundError: No module named 'lightgbm'

## Load Dataset

In [None]:
# Read the abalone measurements from the CSV located in the working directory.
df = pd.read_csv('abalone.csv')
# Bare last expression so the notebook renders the loaded frame.
df

## About Features

### Input Features Handling

### Target Feature

## Data Analysis: EDA

In [None]:
df.info()

In [None]:
# Give the nine columns underscore-separated names (assigned by position).
column_names = [
    'Sex', 'Length', 'Diameter', 'Height',
    'Whole_weight', 'Shucked_weight', 'Viscera_weight', 'Shell_weight',
    'Rings',
]
df.columns = column_names
df.columns

## Univariate Analysis

### Sex

In [None]:
df['Sex'].unique()   # Male, Female, Infant

In [None]:
df['Sex'].value_counts()

In [None]:
# Bar chart of the number of samples in each Sex category.
ax = sns.countplot(data=df, x='Sex')
ax.set_title("Sex Distributions", fontsize=14)
ax.set_xlabel('Sex', fontsize=12)
ax.set_ylabel('Number Of Counts', fontsize=12)
plt.show()

In [None]:
# Pie chart: share of each Sex category, annotated with one-decimal percentages.
sex_counts = df['Sex'].value_counts()
sex_counts.plot.pie(autopct='%.1f%%')
plt.legend()
plt.show()

## Statistics of Numerical Features

In [None]:
# Summary statistics for every numeric column: one row per feature, one column
# per statistic. Non-numeric columns (e.g. the raw 'Sex' labels) are skipped.
numeric = df.select_dtypes(include='number')

d = pd.DataFrame({
    'Minimum': numeric.min(),
    'Maximum': numeric.max(),
    'Mean': numeric.mean(),
    # Median and Mode were previously both filled with the mean by mistake.
    'Median': numeric.median(),
    # A column can have several modes; .mode() sorts them, so row 0 is the smallest.
    'Mode': numeric.mode().iloc[0],
    'Standard_Deviation': numeric.std(),
    'Variance': numeric.var(),
    '25%_Quantile': numeric.quantile(0.25),
    '75%_Quantile': numeric.quantile(0.75),
})

In [None]:
d

### Length

In [None]:
df['Length'].describe()

In [None]:
# Left panel: kernel density estimate of Length (longest shell measurement).
# Right panel: normal probability (Q-Q) plot to judge departure from normality.
fig, (ax_kde, ax_qq) = plt.subplots(1, 2, figsize=(12, 5))
sns.kdeplot(df['Length'], ax=ax_kde)
ax_kde.set_title("Kde Plot Length", fontsize=14)
ax_kde.set_xlabel('Length', fontsize=12)
# A KDE's y-axis is a probability density, not a count.
ax_kde.set_ylabel('Density', fontsize=12)
sm.probplot(df['Length'], dist='norm', plot=ax_qq)
plt.show()

In [None]:
# Seaborn histogram of Length.
ax = sns.histplot(data=df, x='Length', color='Violet')
ax.set_title("Histogram of Length")
plt.show()

In [None]:
# Plain matplotlib histogram of Length.
fig, ax = plt.subplots()
ax.hist(df['Length'], color='Violet')
ax.set_title("Histograms of Length Distributions")
plt.show()

### Diameter

In [None]:
# Left panel: kernel density estimate of Diameter.
# Right panel: normal probability (Q-Q) plot to judge departure from normality.
fig, (ax_kde, ax_qq) = plt.subplots(1, 2, figsize=(12, 5))
sns.kdeplot(df['Diameter'], ax=ax_kde)
ax_kde.set_title("Kde Plot Diameter", fontsize=14)
ax_kde.set_xlabel('Diameter', fontsize=12)
# A KDE's y-axis is a probability density, not a count.
ax_kde.set_ylabel('Density', fontsize=12)
sm.probplot(df['Diameter'], dist='norm', plot=ax_qq)
plt.show()

In [None]:
# Seaborn histogram of Diameter.
ax = sns.histplot(data=df, x='Diameter', color='Purple')
ax.set_title("Histogram of Diameter")
plt.show()

In [None]:
# Plain matplotlib histogram of Diameter.
fig, ax = plt.subplots()
ax.hist(df['Diameter'], color='Purple')
ax.set_title("Histograms of Diameter Distributions")
plt.show()

### Height

In [None]:
# Left panel: kernel density estimate of Height.
# Right panel: normal probability (Q-Q) plot to judge departure from normality.
fig, (ax_kde, ax_qq) = plt.subplots(1, 2, figsize=(12, 5))
sns.kdeplot(df['Height'], ax=ax_kde)
ax_kde.set_title("Kde Plot Height", fontsize=14)
ax_kde.set_xlabel('Height', fontsize=12)
# A KDE's y-axis is a probability density, not a count.
ax_kde.set_ylabel('Density', fontsize=12)
sm.probplot(df['Height'], dist='norm', plot=ax_qq)
plt.show()

In [None]:
# Seaborn histogram of Height.
ax = sns.histplot(data=df, x='Height', color='Cyan')
ax.set_title("Histogram of Height")
plt.show()

In [None]:
# Plain matplotlib histogram of Height.
fig, ax = plt.subplots()
ax.hist(df['Height'], color='Cyan')
ax.set_title("Histograms of Height Distributions")
plt.show()

### Whole_weight

In [None]:
# Left panel: kernel density estimate of Whole_weight.
# Right panel: normal probability (Q-Q) plot to judge departure from normality.
fig, (ax_kde, ax_qq) = plt.subplots(1, 2, figsize=(12, 5))
sns.kdeplot(df['Whole_weight'], ax=ax_kde)
ax_kde.set_title("Kde Plot Whole_weight", fontsize=14)
ax_kde.set_xlabel('Whole_weight', fontsize=12)
# A KDE's y-axis is a probability density, not a count.
ax_kde.set_ylabel('Density', fontsize=12)
sm.probplot(df['Whole_weight'], dist='norm', plot=ax_qq)
plt.show()

In [None]:
# Seaborn histogram of Whole_weight.
ax = sns.histplot(data=df, x='Whole_weight', color='Orange')
ax.set_title("Histogram of Whole_weight")
plt.show()

In [None]:
# Plain matplotlib histogram of Whole_weight.
fig, ax = plt.subplots()
ax.hist(df['Whole_weight'], color='Orange')
ax.set_title("Histograms of Whole_weight Distributions")
plt.show()

### Shucked_weight

In [None]:
# Left panel: kernel density estimate of Shucked_weight.
# Right panel: normal probability (Q-Q) plot to judge departure from normality.
fig, (ax_kde, ax_qq) = plt.subplots(1, 2, figsize=(12, 5))
sns.kdeplot(df['Shucked_weight'], ax=ax_kde)
ax_kde.set_title("Kde Plot Shucked_weight", fontsize=14)
ax_kde.set_xlabel('Shucked_weight', fontsize=12)
# A KDE's y-axis is a probability density, not a count.
ax_kde.set_ylabel('Density', fontsize=12)
sm.probplot(df['Shucked_weight'], dist='norm', plot=ax_qq)
plt.show()

In [None]:
# Seaborn histogram of Shucked_weight.
ax = sns.histplot(data=df, x='Shucked_weight', color='Red')
ax.set_title("Histogram of Shucked_weight")
plt.show()

In [None]:
# Plain matplotlib histogram of Shucked_weight.
fig, ax = plt.subplots()
ax.hist(df['Shucked_weight'], color='Red')
ax.set_title("Histograms of Shucked_weight Distributions")
plt.show()

### Viscera_weight

In [None]:
# Left panel: kernel density estimate of Viscera_weight.
# Right panel: normal probability (Q-Q) plot to judge departure from normality.
fig, (ax_kde, ax_qq) = plt.subplots(1, 2, figsize=(12, 5))
sns.kdeplot(df['Viscera_weight'], ax=ax_kde)
ax_kde.set_title("Kde Plot Viscera_weight", fontsize=14)
ax_kde.set_xlabel('Viscera_weight', fontsize=12)
# A KDE's y-axis is a probability density, not a count.
ax_kde.set_ylabel('Density', fontsize=12)
sm.probplot(df['Viscera_weight'], dist='norm', plot=ax_qq)
plt.show()

In [None]:
# Seaborn histogram of Viscera_weight.
ax = sns.histplot(data=df, x='Viscera_weight', color='Green')
ax.set_title("Histogram of Viscera_weight")
plt.show()

In [None]:
# Plain matplotlib histogram of Viscera_weight.
fig, ax = plt.subplots()
ax.hist(df['Viscera_weight'], color='Green')
ax.set_title("Histograms of Viscera_weight Distributions")
plt.show()

### Shell_weight

In [None]:
# Left panel: kernel density estimate of Shell_weight.
# Right panel: normal probability (Q-Q) plot to judge departure from normality.
fig, (ax_kde, ax_qq) = plt.subplots(1, 2, figsize=(12, 5))
sns.kdeplot(df['Shell_weight'], ax=ax_kde)
ax_kde.set_title("Kde Plot Shell_weight", fontsize=14)
ax_kde.set_xlabel('Shell_weight', fontsize=12)
# A KDE's y-axis is a probability density, not a count.
ax_kde.set_ylabel('Density', fontsize=12)
sm.probplot(df['Shell_weight'], dist='norm', plot=ax_qq)
plt.show()

In [None]:
# Seaborn histogram of Shell_weight.
ax = sns.histplot(data=df, x='Shell_weight', color='Yellow')
ax.set_title("Histogram of Shell_weight")
plt.show()

In [None]:
# Plain matplotlib histogram of Shell_weight.
fig, ax = plt.subplots()
ax.hist(df['Shell_weight'], color='Yellow')
ax.set_title("Histograms of Shell_weight Distributions")
plt.show()

### Target Feature Rings

In [None]:
# Left panel: kernel density estimate of the target column Rings.
# Right panel: normal probability (Q-Q) plot to judge departure from normality.
fig, (ax_kde, ax_qq) = plt.subplots(1, 2, figsize=(12, 5))
sns.kdeplot(df['Rings'], ax=ax_kde)
ax_kde.set_title("Kde Plot Rings", fontsize=14)
ax_kde.set_xlabel('Rings', fontsize=12)
# A KDE's y-axis is a probability density, not a count.
ax_kde.set_ylabel('Density', fontsize=12)
sm.probplot(df['Rings'], dist='norm', plot=ax_qq)
plt.show()

In [None]:
# Seaborn histogram of the target column Rings.
ax = sns.histplot(data=df, x='Rings', color='Black')
ax.set_title("Histogram of Rings")
plt.show()

In [None]:
# Plain matplotlib histogram of the target column Rings.
fig, ax = plt.subplots()
ax.hist(df['Rings'], color='Black')
ax.set_title("Histograms of Rings Distributions")
plt.show()

In [None]:
df['Rings'].describe()

In [None]:
df['Rings'].unique()

## Bivariate Analysis

In [None]:
# Length against the target, to eyeball the relationship.
ax = sns.scatterplot(data=df, x='Length', y='Rings')
ax.set_title("Length VS Rings Relationship", fontsize=12)
ax.set_xlabel('Length', fontsize=12)
ax.set_ylabel('Rings', fontsize=12)
plt.show()

In [None]:
# Diameter against the target, to eyeball the relationship.
ax = sns.scatterplot(data=df, x='Diameter', y='Rings', color='Cyan')
ax.set_title("Diameter VS Rings Relationship", fontsize=12)
ax.set_xlabel('Diameter', fontsize=12)
ax.set_ylabel('Rings', fontsize=12)
plt.show()

In [None]:
# Height against the target, to eyeball the relationship.
ax = sns.scatterplot(data=df, x='Height', y='Rings', color='Green')
ax.set_title("Height VS Rings Relationship", fontsize=12)
ax.set_xlabel('Height', fontsize=12)
ax.set_ylabel('Rings', fontsize=12)
plt.show()

In [None]:
# Whole_weight against the target, to eyeball the relationship.
ax = sns.scatterplot(data=df, x='Whole_weight', y='Rings', color='Red')
ax.set_title("Whole_weight VS Rings Relationship", fontsize=12)
ax.set_xlabel('Whole_weight', fontsize=12)
ax.set_ylabel('Rings', fontsize=12)
plt.show()

In [None]:
# Shucked_weight against the target, to eyeball the relationship.
ax = sns.scatterplot(data=df, x='Shucked_weight', y='Rings', color='Pink')
ax.set_title("Shucked_weight VS Rings Relationship", fontsize=12)
ax.set_xlabel('Shucked_weight', fontsize=12)
ax.set_ylabel('Rings', fontsize=12)
plt.show()

In [None]:
# Viscera_weight against the target, to eyeball the relationship.
ax = sns.scatterplot(data=df, x='Viscera_weight', y='Rings', color='Blue')
ax.set_title("Viscera_weight VS Rings Relationship", fontsize=12)
ax.set_xlabel('Viscera_weight', fontsize=12)
ax.set_ylabel('Rings', fontsize=12)
plt.show()

In [None]:
# Shell_weight against the target, to eyeball the relationship.
ax = sns.scatterplot(data=df, x='Shell_weight', y='Rings', color='Yellow')
ax.set_title("Shell_weight VS Rings Relationship", fontsize=12)
ax.set_xlabel('Shell_weight', fontsize=12)
ax.set_ylabel('Rings', fontsize=12)
plt.show()

## Handling Categorical Data

In [None]:
df['Sex'].unique()

In [None]:
# Encode the Sex labels as integers: F -> 0, M -> 1, I -> 2.
sex_codes = {'F': 0, 'M': 1, 'I': 2}
df['Sex'] = df['Sex'].replace(sex_codes)

In [None]:
df['Sex'].unique()

## Multivariate Analysis

In [None]:
df.head()

In [None]:
# Pairwise correlation between all columns of df ('Sex' is integer-encoded by this point).
cor = df.corr()
# Bare last expression so the notebook renders the matrix.
cor

In [None]:
# Annotated heatmap of the correlation matrix on a 10x10-inch canvas.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(cor, annot=True, ax=ax)
plt.show()

In [None]:
sns.pairplot(df);

## Feature Engineering

In [None]:
df.head()

## Checking Skewness

In [None]:
# One KDE panel per column, laid out on a 4x3 grid, to inspect skewness visually.
plt.figure(figsize=(12, 16))
for panel, col in enumerate(df.columns, start=1):
    plt.subplot(4, 3, panel)
    sns.kdeplot(df[col])

In [None]:
df.skew()

## Detecting Outliers

In [None]:
# One box plot per column, laid out on a 4x3 grid, to spot outliers.
plt.figure(figsize=(12, 16))
for panel, col in enumerate(df.columns, start=1):
    plt.subplot(4, 3, panel)
    # Double brackets keep a one-column DataFrame, as DataFrame.boxplot expects.
    df[[col]].boxplot()

## Split The Data

In [None]:
# Separate the target column 'Rings' from the predictor columns.
y = df['Rings']
x = df.drop(columns='Rings')

In [None]:
# Hold out a test set. Fixing random_state makes the split, and therefore every
# metric and saved artefact derived from it, reproducible across kernel restarts.
# NOTE(review): test_size=0.02 leaves only 2% of the rows for evaluation, so test
# metrics will be noisy -- confirm this is intentional and not a typo for 0.2.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.02, shuffle=True, random_state=42
)
for part in (x_train, x_test, y_train, y_test):
    print(part.shape)

In [None]:
x_train.head()

In [None]:
y_train

## Scale The Data

In [None]:
# Fit the min-max scaler on the training split only.
scaler = MinMaxScaler().fit(x_train)
scaler

In [None]:
# Apply the fitted scaler to both splits and wrap the resulting arrays back
# into DataFrames carrying the original column names.
train_values = scaler.transform(x_train)
test_values = scaler.transform(x_test)
x_train_scaled = pd.DataFrame(train_values, columns=x_train.columns)
x_test_scaled = pd.DataFrame(test_values, columns=x_test.columns)
x_train_scaled

In [None]:
x_test_scaled

## Build Model

## LinearRegression

In [None]:
# Ordinary least squares on the scaled training features.
model = LinearRegression().fit(x_train_scaled, y_train)
model

In [None]:
# Predict on the scaled test split and round to two decimals.
y_pred = np.around(model.predict(x_test_scaled), 2)
y_pred

In [None]:
y_test.values

In [None]:
# Report MAE / MSE / R2 for the linear model: test split first (using the
# y_pred computed in the preceding cell), then the training split.
y_pred_tr = model.predict(x_train_scaled)

for split_name, actual, predicted in (('Test', y_test, y_pred), ('Train', y_train, y_pred_tr)):
    if split_name == 'Train':
        # Separator between the two reports.
        print('-----------------------------------------------')
    print(f'----Model Evaluation On {split_name} Data----')
    print()
    print(f"MAE : {mean_absolute_error(actual, predicted)}")
    print(f"MSE: {mean_squared_error(actual, predicted)}")
    print(f"R2 Score : {r2_score(actual, predicted)}")

## RandomForestRegressor

In [None]:
# Baseline random forest with default hyper-parameters, fitted on the raw
# (unscaled) training features.
rf_model = RandomForestRegressor().fit(x_train, y_train)
rf_model

In [None]:
# Predict on the raw test split and round to two decimals.
y_pred = np.around(rf_model.predict(x_test), 2)
y_pred

In [None]:
y_test.values

In [None]:
# Report MAE / MSE / R2 for the baseline random forest: test split first (using
# the y_pred computed in the preceding cell), then the training split.
y_pred_tr = rf_model.predict(x_train)

for split_name, actual, predicted in (('Test', y_test, y_pred), ('Train', y_train, y_pred_tr)):
    if split_name == 'Train':
        # Separator between the two reports.
        print('-----------------------------------------------')
    print(f'----Model Evaluation On {split_name} Data----')
    print()
    print(f"MAE : {mean_absolute_error(actual, predicted)}")
    print(f"MSE: {mean_squared_error(actual, predicted)}")
    print(f"R2 Score : {r2_score(actual, predicted)}")

## Hyperparameter Tuning: Random Forest Regression

In [None]:
# Randomised hyper-parameter search for the random forest with 5-fold CV.
# Both the forest and the search are seeded so the selected "best" estimator
# (which is later pickled) is the same on every run.
rf_model = RandomForestRegressor(random_state=42)

# Candidate values sampled by the search; the np.arange upper bounds are exclusive.
param_grid = {'n_estimators':np.arange(50,200),
    'criterion':["squared_error", "absolute_error", "friedman_mse", "poisson"],
    'max_depth':np.arange(2,25),
    'min_samples_split':np.arange(2,25),
    'min_samples_leaf':np.arange(2,25)}

# n_jobs=-1 runs the candidate fits in parallel on all available cores.
rscv_rf_model = RandomizedSearchCV(rf_model, param_grid, cv=5, random_state=42, n_jobs=-1)
rscv_rf_model.fit(x_train,y_train)
rscv_rf_model.best_estimator_

In [None]:
# Take the best estimator found by the search, predict on the raw test split
# and round to two decimals.
new_rf_model = rscv_rf_model.best_estimator_
y_pred = np.around(new_rf_model.predict(x_test), 2)
y_pred

In [None]:
y_test.values

In [None]:
# Report MAE / MSE / R2 for the tuned random forest: test split first (using
# the y_pred computed in the preceding cell), then the training split.
y_pred_tr = new_rf_model.predict(x_train)

for split_name, actual, predicted in (('Test', y_test, y_pred), ('Train', y_train, y_pred_tr)):
    if split_name == 'Train':
        # Separator between the two reports.
        print('-----------------------------------------------')
    print(f'----Model Evaluation On {split_name} Data----')
    print()
    print(f"MAE : {mean_absolute_error(actual, predicted)}")
    print(f"MSE: {mean_squared_error(actual, predicted)}")
    print(f"R2 Score : {r2_score(actual, predicted)}")

In [None]:
new_rf_model.feature_importances_

In [None]:
# Label the tuned forest's importances with the training column names.
importances = new_rf_model.feature_importances_
features_imp = pd.Series(importances, index=x_train.columns)
features_imp

In [None]:
# Horizontal bar chart of the feature importances.
ax = features_imp.plot.barh()
ax.set_title('Features Importance Distributions', fontsize=14)
plt.show()

In [None]:
# XGBoost regressor with default hyper-parameters, fitted on the raw training features.
xgb_model = XGBRegressor().fit(x_train, y_train)
xgb_model

In [None]:
# Predict on the raw test split and round to two decimals.
y_pred = np.around(xgb_model.predict(x_test), 2)
y_pred

In [None]:
# Report MAE / MSE / R2 for the XGBoost model: test split first (using the
# y_pred computed in the preceding cell), then the training split.
y_pred_tr = xgb_model.predict(x_train)

for split_name, actual, predicted in (('Test', y_test, y_pred), ('Train', y_train, y_pred_tr)):
    if split_name == 'Train':
        # Separator between the two reports.
        print('-----------------------------------------------')
    print(f'----Model Evaluation On {split_name} Data----')
    print()
    print(f"MAE : {mean_absolute_error(actual, predicted)}")
    print(f"MSE: {mean_squared_error(actual, predicted)}")
    print(f"R2 Score : {r2_score(actual, predicted)}")

## Save Model

In [None]:
new_rf_model

In [None]:
scaler

In [None]:
# Persist the tuned random forest and the fitted scaler as separate pickle
# files in the working directory.
artifacts = {'rf_model.pickle': new_rf_model, 'scaler.pickle': scaler}
for filename, obj in artifacts.items():
    with open(filename, 'wb') as file:
        pickle.dump(obj, file)

In [None]:
# Metadata needed at inference time: the feature column order and the integer
# code assigned to each Sex category.
sex_encoding = {'Female': 0, 'Male': 1, 'Infant': 2}
features_dict = {'columns': x_train.columns.tolist(), 'Sex': sex_encoding}
features_dict

In [None]:
len(features_dict['columns'])

In [None]:
features_dict['Sex']['Female']

In [None]:
# Write the feature metadata to disk as JSON.
features_json = json.dumps(features_dict)
with open('features_data.json', 'w') as file:
    file.write(features_json)

In [None]:
df.head()

## Taking User Inputs

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# One sample, in the same column order as x_train:
# Sex (0=F, 1=M, 2=I), Length, Diameter, Height, Whole_weight,
# Shucked_weight, Viscera_weight, Shell_weight.
input_data = [1,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.15]
print('Input Data:',input_data)
print()
# new_rf_model was fitted on the raw (unscaled) x_train, so it must be given raw
# features here as well. Feeding it MinMax-scaled values (as before) compares the
# input against split thresholds learned on a different scale and yields wrong
# predictions. Wrapping the sample in a DataFrame keeps the feature names the
# model was trained with.
input_frame = pd.DataFrame([input_data], columns=x_train.columns)
pred = new_rf_model.predict(input_frame)[0]
# The model's target is 'Rings', not age; per the UCI Abalone dataset
# description, age in years is rings + 1.5.
print(f"Predicted Rings: {pred:.2f}")
print(f"Predicted Age is: {pred + 1.5:.2f} Yr")