In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn import linear_model
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error

In [2]:
# 1. Load the raw CSV.
data = pd.read_csv('CS_08_20.csv')

# 2. Encode the categorical columns:
#    - 'Make'    -> one-hot indicator columns prefixed with 'Maker'
#    - 'Segment' -> integer codes stored in 'Segment_LabelEncoded'
#      (the original 'Segment' column is kept at this stage)
label_encoder = LabelEncoder()
df = (
    pd.get_dummies(pd.DataFrame(data), columns=['Make'], prefix='Maker')
    .assign(Segment_LabelEncoded=lambda frame: label_encoder.fit_transform(frame['Segment']))
)

df

Unnamed: 0,Car_name,BP_USD,MP_USD,FP_USD,Segment,Min_CC,Max_CC,Single_CC,Min_HP,Max_HP,...,Maker_Scion,Maker_Smart,Maker_Subaru,Maker_Subaru/GM,Maker_Suzuki,Maker_Tesla,Maker_Toyota,Maker_Volkswagen,Maker_Volvo,Segment_LabelEncoded
0,Volkswagen Jetta,22500.0,26000.0,30000.0,B1,1600.00,2000.00,,150.0,228.0,...,0,0,0,0,0,0,0,1,0,1
1,Ford Focus,20000.0,23000.0,27000.0,B1,1600.00,2000.00,,120.0,252.0,...,0,0,0,0,0,0,0,0,0,1
2,Hyundai Elantra,20500.0,23000.0,28000.0,B1,1600.00,2000.00,,120.0,276.0,...,0,0,0,0,0,0,0,0,0,1
3,Audi TT,40000.0,45000.0,55000.0,D2,1800.00,2000.00,,160.0,280.0,...,0,0,0,0,0,0,0,0,0,6
4,Audi A3,30000.0,35000.0,40000.0,B1,1800.00,2000.00,,150.0,300.0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
550,Volkswagen,36000.0,42000.0,50000.0,C1,2000.00,2500.00,0.0,200.0,280.0,...,0,0,0,0,0,0,0,1,0,3
551,Chrysler 200,22000.0,25000.0,30000.0,C1,2400.00,2400.00,1.0,184.0,184.0,...,0,0,0,0,0,0,0,0,0,3
552,Cadillac CT5,45000.0,50000.0,60000.0,C1,2000.00,3600.00,0.0,237.0,335.0,...,0,0,0,0,0,0,0,0,0,3
553,Jeep Patriot,20000.0,25000.0,30000.0,C2,2400.00,2400.00,1.0,172.0,172.0,...,0,0,0,0,0,0,0,0,0,4


In [3]:
df.columns

Index(['Car_name', 'BP_USD', 'MP_USD', 'FP_USD', 'Segment', 'Min_CC', 'Max_CC',
       'Single_CC', 'Min_HP', 'Max_HP', 'No_add_prf', 'FE_KML_MIN',
       'FE_KML_MAX', 'TCO_Min', 'TCO_Max', 'YTD', 'YTD LY', 'CHANGE %', 'Year',
       'FED_INT_CHG', 'UNMP_CHG', 'GDP_CHG', 'UNMP_RATE', 'Maker_Acura',
       'Maker_Alfa Romeo', 'Maker_Audi', 'Maker_BMW', 'Maker_Buick',
       'Maker_Cadillac', 'Maker_Chevrolet', 'Maker_Chrysler', 'Maker_Dodge',
       'Maker_Fiat', 'Maker_Ford', 'Maker_GMC', 'Maker_Genesis', 'Maker_Honda',
       'Maker_Hummer', 'Maker_Hyundai', 'Maker_Infiniti', 'Maker_Isuzu',
       'Maker_Jaguar', 'Maker_Jeep', 'Maker_Kia', 'Maker_Land Rover',
       'Maker_Lexus', 'Maker_Lincoln', 'Maker_Mazda', 'Maker_Mercedes-Benz',
       'Maker_Mercury', 'Maker_Mini', 'Maker_Mitsubishi', 'Maker_Nissan',
       'Maker_Pontiac', 'Maker_Porsche', 'Maker_Ram', 'Maker_Saab',
       'Maker_Saturn', 'Maker_Scion', 'Maker_Smart', 'Maker_Subaru',
       'Maker_Subaru/GM', 'Maker_Suzuki', 

In [4]:
# Drop columns that are not used as model features: the free-text 'Car_name',
# the raw 'Segment' (its codes live in 'Segment_LabelEncoded'), 'CHANGE %' and 'Year'.
# 'YTD LY' is deliberately kept as a feature.
#
# DataFrame.drop(..., inplace=True) returns None, so assigning its result (as the
# previous version did with `df_dropped = ...`) only stored None. Rebind `df` to the
# frame returned by a regular drop instead.
df = df.drop(columns=['Car_name', 'Segment', 'CHANGE %', 'Year'])


The model improved drastically when last year's year-to-date sales (`YTD LY`) were included as a feature.

This makes model fitting and prediction a lot easier.

# Removing Electric Vehicles from the list

In [5]:
# Marker value that identifies the rows to exclude.
value_to_find = 'Electric'

# Index labels of every row in which at least one cell equals the marker value.
has_marker = df.isin([value_to_find]).any(axis=1)
rows_to_drop = df.index[has_marker]

# New frame without those rows (`df` itself is left untouched).
df_dropped = df.drop(index=rows_to_drop)

df_dropped

Unnamed: 0,BP_USD,MP_USD,FP_USD,Min_CC,Max_CC,Single_CC,Min_HP,Max_HP,No_add_prf,FE_KML_MIN,...,Maker_Scion,Maker_Smart,Maker_Subaru,Maker_Subaru/GM,Maker_Suzuki,Maker_Tesla,Maker_Toyota,Maker_Volkswagen,Maker_Volvo,Segment_LabelEncoded
0,22500.0,26000.0,30000.0,1600.00,2000.00,,150.0,228.0,,15.0,...,0,0,0,0,0,0,0,1,0,1
1,20000.0,23000.0,27000.0,1600.00,2000.00,,120.0,252.0,,15.0,...,0,0,0,0,0,0,0,0,0,1
2,20500.0,23000.0,28000.0,1600.00,2000.00,,120.0,276.0,,14.0,...,0,0,0,0,0,0,0,0,0,1
3,40000.0,45000.0,55000.0,1800.00,2000.00,,160.0,280.0,,9.0,...,0,0,0,0,0,0,0,0,0,6
4,30000.0,35000.0,40000.0,1800.00,2000.00,,150.0,300.0,,12.0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
550,36000.0,42000.0,50000.0,2000.00,2500.00,0.0,200.0,280.0,0.0,8.0,...,0,0,0,0,0,0,0,1,0,3
551,22000.0,25000.0,30000.0,2400.00,2400.00,1.0,184.0,184.0,1.0,10.0,...,0,0,0,0,0,0,0,0,0,3
552,45000.0,50000.0,60000.0,2000.00,3600.00,0.0,237.0,335.0,0.0,7.0,...,0,0,0,0,0,0,0,0,0,3
553,20000.0,25000.0,30000.0,2400.00,2400.00,1.0,172.0,172.0,1.0,10.0,...,0,0,0,0,0,0,0,0,0,4


In [6]:
# # The value to search for
# value_to_find = 'Electric'

# # Find columns containing the value
# columns_to_drop = [col for col in df.columns if value_to_find in df[col].values]
# # Drop the columns containing the value
# df_dropped_1 = df.drop(columns=columns_to_drop)

# # Show the modified DataFrame
# df_dropped_1

In [7]:
df_dropped.shape

(538, 64)

###### Filling NAN values with 0

In [8]:
df_filled = df_dropped.fillna(0)  # Replace all NaN values with 0
df_filled

Unnamed: 0,BP_USD,MP_USD,FP_USD,Min_CC,Max_CC,Single_CC,Min_HP,Max_HP,No_add_prf,FE_KML_MIN,...,Maker_Scion,Maker_Smart,Maker_Subaru,Maker_Subaru/GM,Maker_Suzuki,Maker_Tesla,Maker_Toyota,Maker_Volkswagen,Maker_Volvo,Segment_LabelEncoded
0,22500.0,26000.0,30000.0,1600.00,2000.00,0.0,150.0,228.0,0.0,15.0,...,0,0,0,0,0,0,0,1,0,1
1,20000.0,23000.0,27000.0,1600.00,2000.00,0.0,120.0,252.0,0.0,15.0,...,0,0,0,0,0,0,0,0,0,1
2,20500.0,23000.0,28000.0,1600.00,2000.00,0.0,120.0,276.0,0.0,14.0,...,0,0,0,0,0,0,0,0,0,1
3,40000.0,45000.0,55000.0,1800.00,2000.00,0.0,160.0,280.0,0.0,9.0,...,0,0,0,0,0,0,0,0,0,6
4,30000.0,35000.0,40000.0,1800.00,2000.00,0.0,150.0,300.0,0.0,12.0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
550,36000.0,42000.0,50000.0,2000.00,2500.00,0.0,200.0,280.0,0.0,8.0,...,0,0,0,0,0,0,0,1,0,3
551,22000.0,25000.0,30000.0,2400.00,2400.00,1.0,184.0,184.0,1.0,10.0,...,0,0,0,0,0,0,0,0,0,3
552,45000.0,50000.0,60000.0,2000.00,3600.00,0.0,237.0,335.0,0.0,7.0,...,0,0,0,0,0,0,0,0,0,3
553,20000.0,25000.0,30000.0,2400.00,2400.00,1.0,172.0,172.0,1.0,10.0,...,0,0,0,0,0,0,0,0,0,4


In [9]:
# Separate the target from the features: 'YTD' is the value to predict,
# every other column of `df_filled` is used as an input feature.
col_y = df_filled['YTD']
col_x = df_filled.drop(columns=['YTD'])

# Sanity check: both must have the same number of rows.
print(col_x.shape, col_y.shape)

(538, 63) (538,)


In [10]:
col_y

0       97461.0
1      195823.0
2       94720.0
3        4486.0
4        4759.0
         ...   
550        58.0
551        48.0
552        43.0
553        27.0
554        16.0
Name: YTD, Length: 538, dtype: float64

###### CHANGING DATA TYPE TO FLOAT ONCE ELECTRIC IS REMOVED

In [11]:
# Cast both engine-displacement columns of `df_filled` to float.
for cc_column in ('Min_CC', 'Max_CC'):
    df_filled[cc_column] = df_filled[cc_column].astype(float)

# Show the resulting dtypes.
df_filled.dtypes


BP_USD                  float64
MP_USD                  float64
FP_USD                  float64
Min_CC                  float64
Max_CC                  float64
                         ...   
Maker_Tesla               uint8
Maker_Toyota              uint8
Maker_Volkswagen          uint8
Maker_Volvo               uint8
Segment_LabelEncoded      int64
Length: 64, dtype: object

In [12]:
# Hold out 40% of the rows for testing; the fixed seed keeps the split reproducible.
# (numpy and train_test_split are already imported in the first cell.)
X_train, X_test, y_train, y_test = train_test_split(
    col_x,
    col_y,
    test_size=0.4,
    random_state=42,
)
X_train.head()

Unnamed: 0,BP_USD,MP_USD,FP_USD,Min_CC,Max_CC,Single_CC,Min_HP,Max_HP,No_add_prf,FE_KML_MIN,...,Maker_Scion,Maker_Smart,Maker_Subaru,Maker_Subaru/GM,Maker_Suzuki,Maker_Tesla,Maker_Toyota,Maker_Volkswagen,Maker_Volvo,Segment_LabelEncoded
405,40000.0,45000.0,55000.0,2000.0,2500.0,0.0,245.0,295.0,0.0,8.0,...,0,0,0,0,0,0,0,0,0,4
417,17000.0,20000.0,25000.0,1000.0,1500.0,0.0,68.0,106.0,0.0,16.0,...,0,0,0,0,0,0,1,0,0,1
234,23000.0,27000.0,30000.0,3400.0,3400.0,1.0,303.0,303.0,1.0,8.0,...,0,0,0,0,0,0,0,0,0,3
255,25000.0,30000.0,35000.0,4200.0,4200.0,1.0,285.0,285.0,1.0,8.0,...,0,0,0,0,0,0,0,0,0,5
400,40000.0,45000.0,55000.0,2400.0,3500.0,0.0,200.0,355.0,0.0,8.0,...,0,0,0,0,0,0,0,0,0,3


In [13]:
# Cast the engine-displacement columns to float on both splits.
# DataFrame.astype with a column -> dtype mapping returns a new frame, which avoids
# the SettingWithCopyWarning pandas can emit when a column is assigned on the row
# subsets returned by train_test_split.
cc_dtypes = {'Min_CC': float, 'Max_CC': float}
X_train = X_train.astype(cc_dtypes)
X_test = X_test.astype(cc_dtypes)

# Only the last expression of a cell is displayed, so show the dtypes once.
X_test.dtypes

BP_USD                  float64
MP_USD                  float64
FP_USD                  float64
Min_CC                  float64
Max_CC                  float64
                         ...   
Maker_Tesla               uint8
Maker_Toyota              uint8
Maker_Volkswagen          uint8
Maker_Volvo               uint8
Segment_LabelEncoded      int64
Length: 63, dtype: object

In [14]:
# from sklearn.neighbors import KNeighborsClassifier

# knn = KNeighborsClassifier(n_neighbors=2)
# knn.fit(X_train,y_train)

# knn.score(X_train,y_train)
# #knn.score(X_test,y_test)
# #knn.score(train_X_contig,y_test)

# Linear Regression

In [15]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 322 entries, 405 to 102
Data columns (total 63 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   BP_USD                322 non-null    float64
 1   MP_USD                322 non-null    float64
 2   FP_USD                322 non-null    float64
 3   Min_CC                322 non-null    float64
 4   Max_CC                322 non-null    float64
 5   Single_CC             322 non-null    float64
 6   Min_HP                322 non-null    float64
 7   Max_HP                322 non-null    float64
 8   No_add_prf            322 non-null    float64
 9   FE_KML_MIN            322 non-null    float64
 10  FE_KML_MAX            322 non-null    float64
 11  TCO_Min               322 non-null    float64
 12  TCO_Max               322 non-null    float64
 13  YTD LY                322 non-null    float64
 14  FED_INT_CHG           322 non-null    float64
 15  UNMP_CHG             

In [16]:
# Positional (row, column) coordinates of every missing cell in `df`.
nan_rows, nan_cols = np.nonzero(df.isna().to_numpy())

# Pair each column label with the positional row number of the missing value.
[(df.columns[col_pos], row_pos) for row_pos, col_pos in zip(nan_rows, nan_cols)]


[('Single_CC', 0),
 ('No_add_prf', 0),
 ('Single_CC', 1),
 ('No_add_prf', 1),
 ('Single_CC', 2),
 ('No_add_prf', 2),
 ('Single_CC', 3),
 ('No_add_prf', 3),
 ('Single_CC', 4),
 ('No_add_prf', 4),
 ('Single_CC', 5),
 ('No_add_prf', 5),
 ('Single_CC', 6),
 ('No_add_prf', 6),
 ('Single_CC', 7),
 ('No_add_prf', 7),
 ('Single_CC', 8),
 ('No_add_prf', 8),
 ('Single_CC', 9),
 ('No_add_prf', 9),
 ('Single_CC', 10),
 ('No_add_prf', 10),
 ('Single_CC', 11),
 ('No_add_prf', 11),
 ('Single_CC', 12),
 ('No_add_prf', 12),
 ('Single_CC', 13),
 ('No_add_prf', 13),
 ('Single_CC', 14),
 ('No_add_prf', 14),
 ('Single_CC', 15),
 ('No_add_prf', 15),
 ('Single_CC', 16),
 ('No_add_prf', 16),
 ('Single_CC', 17),
 ('No_add_prf', 17),
 ('Single_CC', 18),
 ('Single_CC', 19),
 ('Single_CC', 20),
 ('No_add_prf', 20),
 ('Single_CC', 21),
 ('No_add_prf', 21),
 ('Single_CC', 22),
 ('No_add_prf', 22),
 ('Single_CC', 23),
 ('No_add_prf', 23),
 ('Single_CC', 24),
 ('No_add_prf', 24),
 ('Single_CC', 25),
 ('No_add_prf', 2

# Linear Regression

In [17]:
# Ordinary least squares on the training split; the displayed value is the
# score on the data the model was fitted on.
reg = LinearRegression()
reg.fit(X_train, y_train)
reg.score(X_train, y_train)

0.9745335320127372

In [18]:
reg.score(X_test, y_test)

0.917003416922172

In [20]:
# Predict the held-out rows with the fitted linear model ...
y_pred = reg.predict(X_test)

# ... and report the mean squared error against the true targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error (MSE): {mse}")

Mean Squared Error (MSE): 495067802.3701909


## Ridge Regression

In [None]:
# Ridge regression with a small penalty (alpha=2); shows the training-split score.
ridge_reg = Ridge(alpha=2, max_iter=100, tol=0.1).fit(X_train, y_train)
ridge_reg.score(X_train, y_train)

In [None]:
# Lasso regression with a small penalty (alpha=2).
# The cell ends on fit(), so the fitted estimator itself is displayed.
lasso_reg = Lasso(alpha=2, max_iter=500, tol=0.1)
lasso_reg.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge, Lasso

# Candidate regularization strengths, shared by both searches.
alphas = [0.001, 0.01, 0.1, 1, 10, 100]
alpha_grid = {'alpha': alphas}

# Ridge: 5-fold grid search over alpha.
ridge = Ridge()
ridge_grid = GridSearchCV(ridge, param_grid=alpha_grid, cv=5).fit(X_train, y_train)
print(f"Best Ridge alpha: {ridge_grid.best_params_['alpha']}")

# Lasso: same search.
lasso = Lasso()
lasso_grid = GridSearchCV(lasso, param_grid=alpha_grid, cv=5).fit(X_train, y_train)
print(f"Best Lasso alpha: {lasso_grid.best_params_['alpha']}")

###### Best Alpha suggested as 100 for both RIDGE AND LASSO

## Ridge Regression

In [21]:
# Ridge regression with the alpha chosen by the grid search (100);
# shows the training-split score.
ridge_reg = Ridge(alpha=100, max_iter=100, tol=0.1).fit(X_train, y_train)
ridge_reg.score(X_train, y_train)

0.9681325640574181

In [22]:
ridge_reg.score(X_test, y_test)

0.9412170706269061

## Lasso Regression 

In [23]:
# Lasso regression with the alpha chosen by the grid search (100);
# shows the training-split score.
lasso_reg = Lasso(alpha=100, max_iter=500, tol=0.1).fit(X_train, y_train)
lasso_reg.score(X_train, y_train)

0.9666698146645598

In [24]:
lasso_reg.score(X_test, y_test)

0.9263749855589399

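Conclusion:
Linear Regression has the highest R² on the training split (0.975 vs. 0.968 for Ridge and 0.967 for Lasso), but on the held-out test split both regularized models score higher (Ridge 0.941, Lasso 0.926, Linear 0.917). This likely indicates that:

- The unregularized model overfits somewhat (its train/test gap is the largest), possibly because of multicollinearity among the features.

- The regularization strength (alpha = 100) costs a little training fit but does not cause underfitting; it improves generalization.

- The comparison rests on a single 60/40 split, so the ranking should be confirmed with cross-validation.

The data does benefit from shrinkage (Ridge) and, to a lesser extent, from feature selection (Lasso).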

# from sklearn.neighbors import KNeighborsClassifier

# knn = KNeighborsClassifier(n_neighbors=2)
# knn.fit(X_train,y_train)
# knn.score(X_test,y_test)

# Model #3 XGBoost Regressor Modelling 

In [None]:
#pip install xgboost
# X_train.info()

# # Convert Min_CC and Max_CC to float64
# df_dropped['Min_CC'] = pd.to_numeric(df_dropped['Min_CC'], errors='coerce')  # Convert with error handling (invalid values to NaN)
# df_dropped['Max_CC'] = pd.to_numeric(df_dropped['Max_CC'], errors='coerce')  # Same for Max_CC

# df_filled = df_dropped.fillna(0)  # Replace all NaN values with 0
# df_filled

# # Check the result
# df_dropped.dtypes


In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Baseline gradient-boosted regressor: 100 shallow trees (depth 2), learning rate 0.1.
model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, max_depth=2)

# Train on the training split and predict the held-out rows.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate with Mean Squared Error (MSE).
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Store the score under `r2`, not `r2_score`: binding a float to `r2_score` shadows
# sklearn.metrics.r2_score, so a later `r2_score(y_test, y_pred)` call would fail
# unless the function is re-imported first.
r2 = model.score(X_test, y_test)
print(f"R^2 Score: {r2}")

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter candidates: 3 x 3 x 3 x 3 = 81 combinations, each scored with 5-fold CV.
param_grid = dict(
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    n_estimators=[50, 100, 200],
    subsample=[0.8, 0.9, 1.0],
)

# Base estimator handed to the search.
model = xgb.XGBRegressor(objective='reg:squarederror')

# Exhaustive search, ranked by (negated) mean squared error.
grid_search = GridSearchCV(
    model,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=1,
)
grid_search.fit(X_train, y_train)
print(f"Best Parameters: {grid_search.best_params_}")

# Score the winning configuration on the held-out rows.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error of the best model: {mse}")

In [None]:
from sklearn.model_selection import GridSearchCV

# Narrowed grid: learning rate and depth are fixed, only the number of trees
# and the row-subsampling ratio are tuned.
param_grid = {
    'learning_rate': [0.01],
    'max_depth': [5],
    'n_estimators': [50, 100, 200],
    'subsample': [0.8, 0.9, 1.0]
}

# 5-fold grid search ranked by (negated) mean squared error.
grid_search = GridSearchCV(xgb.XGBRegressor(objective='reg:squarederror'), param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Best parameters found
print(f"Best Parameters: {grid_search.best_params_}")

# Evaluate the best model on the held-out rows.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
# Use `r2`, not `r2_score`: rebinding `r2_score` to a float shadows
# sklearn.metrics.r2_score and breaks later calls to that function.
r2 = best_model.score(X_test, y_test)

print(f"Optimized MSE: {mse}")
print(f"Optimized R² Score: {r2}")


In [None]:
import matplotlib.pyplot as plt

# Refit `model` on the training split, then plot importances by split count ('weight').
# NOTE(review): when the notebook is run top to bottom, `model` here is the base
# XGBRegressor created for the first grid search, not `best_model` — confirm which
# estimator this plot is meant to describe.
model.fit(X_train, y_train)
importance_ax = xgb.plot_importance(model, importance_type='weight')
plt.show()


In [None]:
# Residual = actual - predicted, for the most recent `y_pred`.
residuals = y_test - y_pred

# Residuals against predictions; the dashed red line marks zero error.
fig, ax = plt.subplots()
ax.scatter(y_pred, residuals)
ax.axhline(y=0, color='r', linestyle='--')
ax.set_xlabel("Predicted Values")
ax.set_ylabel("Residuals")
ax.set_title("Residual Plot")
plt.show()


In [None]:
# import joblib

# # Save the model to a file
# joblib.dump(model, 'xgboost_regression_model.pkl')

# # To load the model later
# loaded_model = joblib.load('xgboost_regression_model.pkl')


# SVR Model Start

In [None]:

from sklearn.svm import SVR  # Support Vector Regression

# RBF-kernel SVR with a very large C, fitted on the training split.
svm_model = SVR(kernel='rbf', C=100000, epsilon=0.1).fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred = svm_model.predict(X_test)


In [None]:
from sklearn.metrics import r2_score  # the function, in case the name was rebound earlier

# Test-split error metrics for the current `y_pred`.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation on the training split; scores are negated MSE values,
# so the sign is flipped before reporting the mean.
cv_scores = cross_val_score(
    svm_model,
    X_train,
    y_train,
    cv=5,
    scoring='neg_mean_squared_error',
)
print(f"Cross-Validation MSE: {-cv_scores.mean()}")


In [None]:

# from sklearn.svm import SVR  # Support Vector Regression
# # Initialize and train the Support Vector Regression model
# svm_model = SVR(kernel='linear', C=10000, epsilon=0.1)
# svm_model.fit(X_train, y_train)

# # Make predictions

# y_pred = svm_model.predict(X_test)
# from sklearn.metrics import r2_score  # Ensure the function is imported

# # Evaluate the model
# mse = mean_squared_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)  

# print(f'Mean Squared Error: {mse}')
# print(f'R-squared: {r2}')

In [None]:
# from sklearn.model_selection import cross_val_score

# # Perform 5-fold cross-validation
# cv_scores = cross_val_score(svm_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
# print(f"Cross-Validation MSE: {-cv_scores.mean()}")


In [None]:
# Actual vs. predicted values of the target for the SVR model.
# The target column is 'YTD' (see the feature/target split), so the axes are
# labelled YTD rather than 'CHANGE %'.
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, color='blue', label='Predicted vs Actual')
# Diagonal reference line: points on it are predicted exactly.
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--', label='Perfect Prediction')
plt.xlabel('Actual YTD')
plt.ylabel('Predicted YTD')
plt.title('SVM Regression: Actual vs Predicted YTD')
plt.legend()
plt.show()


# Find SVR Fitting

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR  # Support Vector Regression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Search space: 3 values of C x 3 values of epsilon x 3 kernels = 27 candidates.
param_grid = dict(
    C=[0.01, 0.001, 0.1],
    epsilon=[0.001, 0.01, 0.1],
    kernel=['rbf', 'linear', 'poly'],
)

# 5-fold grid search ranked by (negated) mean squared error.
grid_search = GridSearchCV(
    SVR(),
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
grid_search.fit(X_train, y_train)

# Best configuration and its mean CV score (a negated MSE).
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Score: {grid_search.best_score_}")


In [None]:

# Refit SVR with a linear kernel, C=0.1 and epsilon=0.001.
svm_model = SVR(kernel='linear', C=0.1, epsilon=0.001).fit(X_train, y_train)

# Predict the held-out rows and report both error metrics.
y_pred = svm_model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

In [None]:
svm_model.score(X_train, y_train)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# Scale inside the cross-validation loop. Fitting StandardScaler on all rows up
# front lets each fold's validation rows influence the scaling statistics (data
# leakage); wrapping scaler + SVR in a pipeline refits the scaler on the training
# part of every fold.
svr_pipeline = make_pipeline(StandardScaler(), svm_model)

# 5-fold cross-validation on the full feature matrix; scores are negated MSE values.
cv_scores = cross_val_score(svr_pipeline, col_x, col_y, cv=5, scoring='neg_mean_squared_error')

# Calculate the average MSE across the folds
average_mse = -cv_scores.mean()
print(f"Cross-validated Mean Squared Error: {average_mse}")


##### Average MSE Score for SVR Model is 0.01295366418360382 (cross-validated)

In [None]:


# One-hot encode the feature matrix `col_x` with pandas.get_dummies.
# drop_first=True drops the first level of every encoded column (avoids the
# dummy-variable trap).
# NOTE(review): `X_encoded` is not referenced by the cells visible after this one —
# confirm it is still needed.
X_encoded = pd.get_dummies(col_x, drop_first=True)  # drop_first to avoid dummy variable trap



## Scaling for PCA

In [None]:

# Step 1: Standardize the features (PCA requires scaling)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(col_x)


## PCA with 25 components and Model Prediction

In [None]:
from sklearn.decomposition import PCA
# Step 2: Apply PCA to the standardized features.
pca = PCA(n_components=25)  # keep the first 25 principal components; adjust as needed
X_pca = pca.fit_transform(X_scaled)

# Step 3: Split the PCA-transformed data into training (80%) and test (20%) sets.
# This rebinds X_train / X_test / y_train / y_test, so cells below this point work
# on PCA components rather than on the original feature columns.
# NOTE(review): the PCA is fitted on all rows before the split, so test rows
# influence the components — confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(X_pca, col_y, test_size=0.2, random_state=42)


In [None]:
# StandardScaler.fit_transform returns a NumPy array, which has no .info() method.
# Wrap it in a DataFrame (reusing the feature names) to inspect it.
pd.DataFrame(X_scaled, columns=col_x.columns).info()

In [None]:
from sklearn.linear_model import LinearRegression

# Step 4: fit a linear model on the current training split.
model = LinearRegression().fit(X_train, y_train)

# Step 5: predict the held-out rows.
y_pred = model.predict(X_test)

# Step 6: report test-split error metrics.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")


In [None]:
# Total explained-variance ratio for every component count from 2 to 49.
# After the loop `pca` stays bound to the last fit (49 components).
var_ratio = {}
for n in range(2, 50):
    pca_components = pca = PCA(n_components=n).fit(X_scaled)
    var_ratio[n] = sum(pca.explained_variance_ratio_)
   

In [None]:
var_ratio

In [None]:
# plot the Cumulative Summation of the Explained Variance
plt.step(list(range(1,50)), np.cumsum(pca.explained_variance_ratio_))
plt.plot(np.cumsum(pca.explained_variance_ratio_))

# define the labels & title
plt.xlabel('Number of Components', fontsize = 15)
plt.ylabel('Variance (%)', fontsize = 15) 
plt.title('Explained Variance', fontsize = 20)

# show the plot
plt.show()

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import pandas as pd

# Fit PCA without limiting the number of components.
pca = PCA().fit(X_scaled)

# Share of variance explained by each individual component.
explained_variance_ratio = pca.explained_variance_ratio_
component_numbers = range(1, len(explained_variance_ratio) + 1)

# Scree plot: one marker per principal component.
plt.figure(figsize=(8, 6))
plt.plot(component_numbers, explained_variance_ratio, marker='o', linestyle='--', color='b')
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.grid(True)
plt.xticks(component_numbers)  # one tick per component
plt.show()


The scree plot indicates that the explained variance is spread across many principal components, i.e. many features contribute rather than a few dominating.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Exploratory overview of the cleaned modelling frame `df_filled`.

# 1. Basic information and data types.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line.
print("Basic Data Info:")
df_filled.info()

# 2. Summary statistics for numerical columns
print("\nSummary Statistics:")
print(df_filled.describe())

# 3. Missing values per column
print("\nMissing Values:")
print(df_filled.isnull().sum())

# 4. Distribution of variables (histograms)
df_filled.hist(bins=30, figsize=(15, 10))
plt.suptitle('Histograms of Features')
plt.tight_layout()
plt.show()

# 5. Boxplots for identifying outliers
plt.figure(figsize=(15, 10))
sns.boxplot(data=df_filled, orient="h")
plt.title('Boxplot for Feature Distribution and Outliers')
plt.show()

# 6. Correlation matrix and heatmap
plt.figure(figsize=(12, 10))
correlation_matrix = df_filled.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix Heatmap')
plt.show()

# 7. Pairplot (optional, works best with a smaller number of features)
# sns.pairplot(df_filled)
# plt.show()


In [None]:
from sklearn.model_selection import cross_val_score

# Cross-validate `model` on the full feature matrix with cv=405 folds.
# NOTE(review): this comment previously said "5-fold", but the call passes cv=405 —
# confirm which fold count is intended.
cv_scores = cross_val_score(model, col_x, col_y, cv=405, scoring='neg_mean_squared_error')

# The scorer returns negated MSE values; flip the sign to get positive MSE.
cv_scores = -cv_scores

print(f"Mean Cross-Validation MSE: {cv_scores.mean()}")
print(f"Cross-Validation MSE Standard Deviation: {cv_scores.std()}")


In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Reload the raw CSV. This rebinds `df` to the un-encoded table, which the plotting
# cells below need because they use the original 'Make' and 'Segment' columns.
data = pd.read_csv('CS_08_20.csv')
df = pd.DataFrame(data)

# General information about the dataset.
# DataFrame.info() prints its report and returns None, so it is called directly
# instead of being wrapped in print() (which added a stray "None" line).
df.info()
print(df.describe())

# Visualization: distribution of the three price columns (BP, MP, FP)
plt.figure(figsize=(10,6))
sns.boxplot(data=df[['BP_USD', 'MP_USD', 'FP_USD']])
plt.title('Price Distribution of BP, MP, FP')
plt.ylabel('Price (USD)')
plt.xticks([0, 1, 2], ['BP_USD', 'MP_USD', 'FP_USD'])
plt.show()





In [None]:

# Correlation heatmap for the price, horsepower, fuel-economy, TCO and YTD columns.
heatmap_columns = [
    'BP_USD', 'MP_USD', 'FP_USD',
    'Min_HP', 'Max_HP',
    'FE_KML_MIN', 'FE_KML_MAX',
    'TCO_Min', 'TCO_Max',
    'YTD',
]
corr_matrix = df[heatmap_columns].corr()

plt.figure(figsize=(12,8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()

#, 'YTD_LY', 'CHANGE_%'

In [None]:
# BP_USD against Segment, coloured by car make.
# The title and axis labels now describe the columns that are actually plotted;
# they previously referred to YTD and CHANGE %, which this figure does not show.
plt.figure(figsize=(8,6))
sns.scatterplot(x=df['BP_USD'], y=df['Segment'], hue=df['Make'], palette='Set2', s=100)
plt.title('BP_USD vs Segment by Car Make')
plt.xlabel('BP_USD')
plt.ylabel('Segment')
plt.show()



In [None]:
# Bar plot of the maximum fuel economy (FE_KML_MAX) per car make.
plt.figure(figsize=(10,6))
sns.barplot(data=df, x='Make', y='FE_KML_MAX', palette='Paired')
plt.title('Max Fuel Economy by Car Make')
plt.xlabel('Car Make')
plt.ylabel('FE_KML_MAX')
plt.xticks(rotation=45)  # tilt the make names
plt.show()


In [None]:
# Distribution of the minimum fuel efficiency (FE_KML_MIN) for each car make.
plt.figure(figsize=(12,6))
sns.boxplot(data=df, x='Make', y='FE_KML_MIN')
plt.title('Fuel Efficiency (Min) by Car Make')
plt.xlabel('Car Make')
plt.ylabel('Fuel Efficiency (km/l)')
plt.xticks(rotation=45)  # tilt the make names
plt.show()


In [None]:
# Pairwise scatter plots for the price, horsepower and minimum fuel-economy columns.
pairplot_columns = ['BP_USD', 'MP_USD', 'FP_USD', 'Min_HP', 'Max_HP', 'FE_KML_MIN']
sns.pairplot(df[pairplot_columns])
plt.suptitle('Pairplot of Car Features', y=1.02)  # y > 1 places the title above the grid
plt.show()


In [None]:
# Distribution of TCO_Max for each car segment.
plt.figure(figsize=(10,6))
sns.boxplot(data=df, x='Segment', y='TCO_Max')
plt.title('Total Cost of Ownership (TCO_Max) by Segment')
plt.xlabel('Segment')
plt.ylabel('TCO_Max (USD)')
plt.show()


# Additional EDA 

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn import linear_model
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error

In [None]:
# Keep the frame exactly as read from disk in `cars_BF`
# (presumably "before fill" — TODO confirm the naming).
cars_BF = pd.read_csv('CS_08_20.csv')

# Working copy with every NaN replaced by 0.
cars = cars_BF.fillna(0)

cars.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings as wr
wr.filterwarnings('ignore')

# Set Seaborn style
sns.set_style("darkgrid")

# Identify numerical columns
numerical_columns = cars.select_dtypes(include=["int64", "float64"]).columns

# Lay the histograms out two per row. The grid previously had one row per feature
# (n rows x 2 columns) although only n panels are drawn, which left the lower half
# of a very tall figure empty.
n_plot_cols = 2
n_plot_rows = max(1, (len(numerical_columns) + n_plot_cols - 1) // n_plot_cols)

# Plot the distribution of each numerical feature, with its skewness in the title.
plt.figure(figsize=(14, n_plot_rows * 3))
for idx, feature in enumerate(numerical_columns, 1):
    plt.subplot(n_plot_rows, n_plot_cols, idx)
    sns.histplot(cars[feature], kde=True)
    plt.title(f"{feature} | Skewness: {round(cars[feature].skew(), 2)}")

# Adjust layout and show plots
plt.tight_layout()
plt.show()


In [None]:
# Univariate: number of rows in `cars` per segment.
quality_counts = cars['Segment'].value_counts()

# Bar chart of those counts, drawn with the explicit Axes interface.
fig, ax = plt.subplots(figsize=(5, 5))
ax.bar(quality_counts.index, quality_counts, color='Blue')
ax.set_title('Segment wise Car sales')
ax.set_xlabel('Segment')
ax.set_ylabel('Count')
plt.show()

Mid-size luxury cars (segments C1 and C2) dominate the market, even during the economic crisis.

High-end and premium luxury SUVs and pickup-type vehicles also sell well.

Ultra-luxury cars and speciality/sports vehicles represent a considerable number and edge out family sedans (D2).

### Interesting Fact: Luxury car is exceeding common man cars and economic car ( Segment : A & B ) sales

In [None]:
# Multivariate: correlation heatmap of the numeric columns in `cars`.

plt.figure(figsize=(15, 10))

# `cars` still contains text columns (e.g. 'Car_name', 'Make', 'Segment').
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0, so restrict the
# frame to numeric/boolean columns first; this works on older versions as well.
numeric_cars = cars.select_dtypes(include=['number', 'bool'])

# Using Seaborn to create a heatmap
sns.heatmap(numeric_cars.corr(), annot=True, fmt='.2f', cmap='Pastel2', linewidths=2)
plt.title('Correlation Heatmap')
plt.show()