<a href="https://colab.research.google.com/github/sadrabr/machin-learning-projects/blob/main/preproccesing_usedcars.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (this import only exists on Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from ast import increment_lineno
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns # type: ignore
%matplotlib inline

In [None]:
# Load the raw used-cars dataset from the mounted Drive folder.
df = pd.read_csv('/content/drive/MyDrive/solo_projects/datasets/used_cars.csv')

# A notebook cell only renders its final bare expression, so every
# intermediate view is sent through display()/print() explicitly.
display(df.head())
df.info()
print(df.shape)

# Drop exact duplicate rows, keeping the first occurrence of each.
df = df.drop_duplicates(keep='first')
print(df.shape)

# Summary statistics: numeric columns first, then the non-numeric ones.
display(df.describe())
df.describe(exclude=[np.number])

In [None]:
# Show the column labels of the frame.
df.columns
# df.dtypes


In [None]:
# Distinct-value count per column, then the frequency table of three columns.
# display() is used throughout because a cell only renders its last bare
# expression; without it the first three results are silently discarded.
display(df.nunique())
for col in ['owner', 'mileage', 'km_driven']:
    display(df[col].value_counts().to_frame())

In [None]:
# Remove the 'mileage' column. errors='ignore' keeps the cell safe to re-run
# once the column is already gone (a plain drop would raise KeyError).
df = df.drop(columns=['mileage'], errors='ignore')
df.head()

# Handling missing values

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Partition the frame by dtype: numeric columns in one frame, object-dtype
# columns in the other.
df_numeric_features = df.select_dtypes(include='number')
df_object_features = df.select_dtypes(include='object')

print(f"Numeric Features :\n{df_numeric_features}\n")
print(f"Object Features :\n{df_object_features}\n")

In [None]:
from sklearn.impute import SimpleImputer

# Fill missing values in every column with that column's most frequent value.
# Each frame gets its own imputer so that fitting one does not overwrite the
# statistics learned for the other, and the original row index is carried over
# instead of being silently replaced by a fresh RangeIndex.
# NOTE(review): both imputers are fitted on the full dataset, i.e. before the
# train/test split performed later in the notebook — confirm this is intended.
numeric_imputer = SimpleImputer(strategy='most_frequent')
df_numeric_features = pd.DataFrame(
    numeric_imputer.fit_transform(df_numeric_features),
    columns=df_numeric_features.columns,
    index=df_numeric_features.index,
)

object_imputer = SimpleImputer(strategy='most_frequent')
df_object_features = pd.DataFrame(
    object_imputer.fit_transform(df_object_features),
    columns=df_object_features.columns,
    index=df_object_features.index,
)

# Sanity check: per-column count of values still missing after imputation.
df_object_features.isnull().sum()

## Handling Outlier data

In [None]:
def find_outliers(column, factor=1.5):
    """Return the values of ``column`` that fall outside its IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of ``column`` and
    ``IQR = Q3 - Q1``.

    Parameters
    ----------
    column : pandas.Series
        Numeric series to inspect.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences.

    Returns
    -------
    pandas.Series
        The entries of ``column`` strictly below the lower fence or strictly
        above the upper fence, with their original index labels. Empty when
        nothing is outside the fences.
    """
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    # Explicit comparisons (rather than a negated range test) so that missing
    # values, which compare False on both sides, are never reported.
    return column[(column < lower_bound) | (column > upper_bound)]

# Print the flagged values of every numeric column; columns for which
# find_outliers returns nothing are skipped.
for column in df_numeric_features.columns:
    outliers = find_outliers(df_numeric_features[column])
    if outliers.size > 0:
        print(f"outliers - {column} :\n{outliers}")

In [None]:
import scipy.stats as stats

# Add a 'zscore_<name>' companion column for each of the listed columns.
for source_column in ('year', 'selling_price'):
    df_numeric_features[f'zscore_{source_column}'] = stats.zscore(df_numeric_features[source_column])
df_numeric_features.head()

In [None]:
# Normalization

In [None]:
# Give both feature frames a fresh positional index so they line up
# row-by-row when concatenated side by side.
df_numeric_features = df_numeric_features.reset_index(drop=True)
df_object_features = df_object_features.reset_index(drop=True)

print("null num", df_object_features.isnull().sum())

# Object columns come first, numeric columns after them.
df = pd.concat([df_object_features, df_numeric_features], axis=1)


In [None]:
# Keep only rows whose year and selling_price z-scores both lie strictly
# between -3 and 3.
zscore_columns = ['zscore_year', 'zscore_selling_price']
within_limits = (df[zscore_columns].abs() < 3).all(axis=1)
df = df[within_limits]
df.head()

In [None]:
# The helper z-score columns have served their purpose; remove them.
df = df.drop(columns=['zscore_year', 'zscore_selling_price'])

# Convert data types

In [None]:
# Inspect the current dtype of every column before converting any of them.
df.dtypes


In [None]:
# Cast the whole-number columns to int in a single call.
df = df.astype({
    'year': int,
    'selling_price': int,
    'km_driven': int,
    'seats': int,
})
df.dtypes

# Transformation and column Distribution and Skew

In [None]:
# df['']

In [None]:
# Number of distinct values in each column.
df.nunique()

# Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns replaced by integer codes. They are re-appended at the end of the
# frame in exactly this order.
categorical_columns = ['fuel', 'seller_type', 'seats', 'transmission', 'owner']

df = df.reset_index(drop=True)

# One fitted encoder is kept per column, so the integer codes can later be
# mapped back with label_encoders[col].inverse_transform(...). Reusing a single
# encoder for every column would keep only the last column's mapping.
# NOTE(review): scikit-learn documents LabelEncoder as intended for target
# labels; OrdinalEncoder/OneHotEncoder are the feature-side equivalents.
label_encoders = {}
encoded_columns = {}
for col in categorical_columns:
    encoder = LabelEncoder()
    encoded_columns[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

df = pd.concat(
    [
        df.drop(columns=categorical_columns),
        pd.DataFrame(encoded_columns, index=df.index),
    ],
    axis=1,
)
df.head()

In [None]:
# Remove the unit suffixes so both columns hold bare numeric strings.
# Vectorized .str operations replace the element-by-element Python loops, and
# .str.strip() drops the whitespace left behind once the suffix is removed.
df['engine'] = df['engine'].str.replace('CC', '', regex=False).str.strip()
df['max_power'] = df['max_power'].str.replace('bhp', '', regex=False).str.strip()
df.head()

In [None]:
import math

# Turn the cleaned strings into numbers: engine becomes int, max_power is
# parsed as float and then rounded down to a whole number.
df['engine'] = df['engine'].astype(int)
df['max_power'] = df['max_power'].astype(float).map(math.floor)
df.dtypes

In [None]:
# Discard the 'torque' and 'name' columns before modelling.
df = df.drop(columns=['torque', 'name'])
df.head(10)

In [None]:
# df.torque.value_counts().to_frame()

# Divide the data into training and test data

In [None]:
from sklearn.model_selection import train_test_split

# Target is the selling price; every remaining column is a feature.
y = df['selling_price']
X = df.drop(columns='selling_price')

# 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# Simple Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Single-predictor design matrices; selecting with a list keeps them 2-D.
single_feature = ['max_power']
X_train_single, X_test_single = X_train[single_feature], X_test[single_feature]

# Fit on the training split and predict the held-out split.
slr_model = LinearRegression().fit(X_train_single, y_train)
y_pred_single = slr_model.predict(X_test_single)

# Held-out error and goodness of fit.
mse_single, r2_single = (
    mean_squared_error(y_test, y_pred_single),
    r2_score(y_test, y_pred_single),
)

print(f"MSE: {mse_single}")
print(f"R2: {r2_single}")

# Multiple Linear Regression

In [None]:
# Ordinary least squares on all feature columns.
mlr_model = LinearRegression().fit(X_train, y_train)
y_pred_multi = mlr_model.predict(X_test)

# Held-out error and goodness of fit.
mse_multi, r2_multi = (
    mean_squared_error(y_test, y_pred_multi),
    r2_score(y_test, y_pred_multi),
)

print(f"MSE: {mse_multi}")
print(f"R2: {r2_multi}")

# Polynomial Regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Single source of truth for the polynomial degree: it drives both the feature
# expansion and the printed label, so the two can no longer drift apart.
POLY_DEGREE = 2

# Pipeline: polynomial feature expansion followed by a linear regression.
poly_features = PolynomialFeatures(degree=POLY_DEGREE)
poly_model = make_pipeline(poly_features, LinearRegression())

poly_model.fit(X_train, y_train)

y_pred_poly = poly_model.predict(X_test)

# Held-out error and goodness of fit.
mse_poly = mean_squared_error(y_test, y_pred_poly)
r2_poly = r2_score(y_test, y_pred_poly)

print(f"Polynomial Regression (Degree {POLY_DEGREE}) - MSE: {mse_poly}, R2: {r2_poly}")

Polynomial Regression (Degree 2) - MSE: 15137590151.312517, R2: 0.8447682097690186


In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
from sklearn.linear_model import Ridge,Lasso,ElasticNet

In [None]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this presumably also hides convergence / failed-fit warnings
# raised by the hyper-parameter searches below — confirm that is wanted.
warnings.filterwarnings('ignore')

In [None]:
# Randomized hyper-parameter search for ElasticNet, scored by 5-fold CV R2.
model = ElasticNet()
param = {
    # Flat list of scalar candidates. The arange values are unpacked into the
    # list; nesting the array itself would make the whole ndarray one single
    # "alpha" candidate, which is not a valid value for the estimator.
    'alpha': [1e-4, 1e-3, 1e-2, *np.arange(0.1, 1.1, 0.1)],
    'l1_ratio': np.arange(0.1, 0.9, 0.1),
}
random_serch = RandomizedSearchCV(
    estimator=model,
    param_distributions=param,
    scoring='r2',
    cv=5,
    n_iter=3,
    verbose=3,
    # Seeded so the sampled candidates are the same on every run.
    random_state=0,
)
random_serch.fit(X_train, y_train)
print("random search : \n================================")
print(f"best parameters :{random_serch.best_params_}")
print(f"best r2_score :{random_serch.best_score_}")

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5] END ..........alpha=0.01, l1_ratio=0.2;, score=0.738 total time=   0.0s
[CV 2/5] END ..........alpha=0.01, l1_ratio=0.2;, score=0.723 total time=   0.0s
[CV 3/5] END ..........alpha=0.01, l1_ratio=0.2;, score=0.694 total time=   0.0s
[CV 4/5] END ..........alpha=0.01, l1_ratio=0.2;, score=0.678 total time=   0.0s
[CV 5/5] END ..........alpha=0.01, l1_ratio=0.2;, score=0.717 total time=   0.0s
[CV 1/5] END alpha=0.001, l1_ratio=0.30000000000000004;, score=0.738 total time=   0.0s
[CV 2/5] END alpha=0.001, l1_ratio=0.30000000000000004;, score=0.723 total time=   0.0s
[CV 3/5] END alpha=0.001, l1_ratio=0.30000000000000004;, score=0.693 total time=   0.0s
[CV 4/5] END alpha=0.001, l1_ratio=0.30000000000000004;, score=0.679 total time=   0.0s
[CV 5/5] END alpha=0.001, l1_ratio=0.30000000000000004;, score=0.718 total time=   0.0s
[CV 1/5] END ........alpha=0.0001, l1_ratio=0.2;, score=0.738 total time=   0.0s
[CV 2/5] END .

In [None]:
# Randomized hyper-parameter search for Lasso, scored by 5-fold CV R2.
model = Lasso()
lasso_parameters = {
    # The grid starts at 0.01 rather than 0: scikit-learn advises against
    # alpha=0 with Lasso for numerical reasons (that case is plain least
    # squares, which LinearRegression already covers).
    'alpha': np.arange(0.01, 1.0, 0.01),
}
random_serch = RandomizedSearchCV(
    estimator=model,
    param_distributions=lasso_parameters,
    scoring='r2',
    cv=5,
    n_iter=3,
    verbose=3,
    # Seeded so the sampled candidates are the same on every run.
    random_state=0,
)
random_serch.fit(X_train, y_train)
print("random search : \n================================")
print(f"best parameters :{random_serch.best_params_}")
print(f"best r2_score :{random_serch.best_score_}")

In [None]:
# Randomized hyper-parameter search for Ridge, scored by 5-fold CV R2.
model = Ridge()
ridge_parameters = {
    'alpha': np.arange(0.00, 1.0, 0.01),
    # 'lbfgs' is left out: Ridge accepts it only together with positive=True,
    # so with the default estimator every such candidate fails to fit.
    # NOTE(review): 'sag'/'saga' are documented to converge quickly only on
    # features of similar scale; the features are not scaled in this notebook,
    # which likely explains the much lower saga score in the recorded output.
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
}
random_serch = RandomizedSearchCV(
    estimator=model,
    param_distributions=ridge_parameters,
    scoring='r2',
    cv=5,
    n_iter=3,
    verbose=3,
    # Seeded so the sampled candidates are the same on every run.
    random_state=0,
)
random_serch.fit(X_train, y_train)
print("random search : \n================================")
print(f"best parameters :{random_serch.best_params_}")
print(f"best r2_score :{random_serch.best_score_}")

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5] END ...........alpha=0.88, solver=auto;, score=0.738 total time=   0.0s
[CV 2/5] END ...........alpha=0.88, solver=auto;, score=0.723 total time=   0.0s
[CV 3/5] END ...........alpha=0.88, solver=auto;, score=0.693 total time=   0.0s
[CV 4/5] END ...........alpha=0.88, solver=auto;, score=0.679 total time=   0.0s
[CV 5/5] END ...........alpha=0.88, solver=auto;, score=0.718 total time=   0.0s
[CV 1/5] END ............alpha=0.33, solver=svd;, score=0.738 total time=   0.0s
[CV 2/5] END ............alpha=0.33, solver=svd;, score=0.723 total time=   0.0s
[CV 3/5] END ............alpha=0.33, solver=svd;, score=0.693 total time=   0.0s
[CV 4/5] END ............alpha=0.33, solver=svd;, score=0.679 total time=   0.0s
[CV 5/5] END ............alpha=0.33, solver=svd;, score=0.718 total time=   0.0s
[CV 1/5] END ...........alpha=0.34, solver=saga;, score=0.052 total time=   0.4s
[CV 2/5] END ...........alpha=0.34, solver=saga;,