In [169]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from scipy import stats


In [170]:
# Read the order-line records from disk and preview the first five rows.
data_df = pd.read_csv(filepath_or_buffer='order_detail.csv')
data_df.head(n=5)

Unnamed: 0,Row ID,Order ID,Product ID,Sales,Quantity,Discount,Profit,Shipping Cost
0,1,MX-2014-143658,OFF-LA-10002782,13.08,3,0.0,4.56,1.033
1,2,MX-2012-155047,FUR-FU-10004015,252.16,8,0.0,90.72,13.449
2,3,MX-2012-155047,FUR-BO-10002352,193.28,2,0.0,54.08,9.627
3,4,MX-2012-155047,OFF-BI-10004428,35.44,4,0.0,4.96,1.371
4,5,MX-2012-155047,OFF-AR-10004594,71.6,2,0.0,11.44,3.787


In [171]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data_df.describe()

Unnamed: 0,Row ID,Sales,Quantity,Discount,Profit,Shipping Cost
count,49670.0,49670.0,49670.0,49670.0,49670.0,49670.0
mean,25763.380974,243.840355,3.470787,0.14307,28.392264,26.041125
std,14838.084164,485.963091,2.274854,0.212046,172.39952,56.870281
min,1.0,0.444,1.0,0.0,-6599.98,0.002
25%,12749.25,30.384,2.0,0.0,0.0,2.58
50%,26073.5,83.97,3.0,0.0,9.154,7.66
75%,38522.75,247.83,5.0,0.2,36.2664,23.969
max,51290.0,22638.5,14.0,0.85,8399.98,933.57


In [172]:
# Derive per-unit figures: divide each order line's totals by its quantity.
units = data_df['Quantity']
data_df['Price'] = data_df['Sales'].div(units)
data_df['Profit for single'] = data_df['Profit'].div(units)
data_df

Unnamed: 0,Row ID,Order ID,Product ID,Sales,Quantity,Discount,Profit,Shipping Cost,Price,Profit for single
0,1,MX-2014-143658,OFF-LA-10002782,13.08,3,0.0,4.56,1.033,4.36,1.52
1,2,MX-2012-155047,FUR-FU-10004015,252.16,8,0.0,90.72,13.449,31.52,11.34
2,3,MX-2012-155047,FUR-BO-10002352,193.28,2,0.0,54.08,9.627,96.64,27.04
3,4,MX-2012-155047,OFF-BI-10004428,35.44,4,0.0,4.96,1.371,8.86,1.24
4,5,MX-2012-155047,OFF-AR-10004594,71.60,2,0.0,11.44,3.787,35.80,5.72
...,...,...,...,...,...,...,...,...,...,...
49665,51286,HU-2012-7730,OFF-AVE-10004570,11.07,1,0.0,3.42,1.980,11.07,3.42
49666,51287,HU-2012-7730,TEC-LOG-10004419,61.44,2,0.0,18.42,13.020,30.72,9.21
49667,51288,HU-2012-7730,OFF-BOS-10002705,80.52,4,0.0,20.88,8.780,20.13,5.22
49668,51289,HU-2012-7730,OFF-ENE-10004132,130.44,4,0.0,33.84,18.970,32.61,8.46


In [173]:
# Single-feature dataset: drop the identifiers and order-level totals so only
# the derived per-unit columns remain.
order_level_columns = [
    'Row ID', 'Order ID', 'Product ID', 'Discount',
    'Sales', 'Quantity', 'Profit', 'Shipping Cost',
]
df_single = data_df.drop(columns=order_level_columns)
df_single


Unnamed: 0,Price,Profit for single
0,4.36,1.52
1,31.52,11.34
2,96.64,27.04
3,8.86,1.24
4,35.80,5.72
...,...,...
49665,11.07,3.42
49666,30.72,9.21
49667,20.13,5.22
49668,32.61,8.46


In [174]:
# Multi-feature dataset: drop the identifiers and the derived per-unit columns,
# keeping the order-level numeric columns.
id_and_derived_columns = ['Row ID', 'Order ID', 'Product ID', 'Price', 'Profit for single']
df_more_features = data_df.drop(columns=id_and_derived_columns)
df_more_features

Unnamed: 0,Sales,Quantity,Discount,Profit,Shipping Cost
0,13.08,3,0.0,4.56,1.033
1,252.16,8,0.0,90.72,13.449
2,193.28,2,0.0,54.08,9.627
3,35.44,4,0.0,4.96,1.371
4,71.60,2,0.0,11.44,3.787
...,...,...,...,...,...
49665,11.07,1,0.0,3.42,1.980
49666,61.44,2,0.0,18.42,13.020
49667,80.52,4,0.0,20.88,8.780
49668,130.44,4,0.0,33.84,18.970


In [177]:
# Active experiment: predict order-line 'Profit' from the remaining columns of
# df_more_features. `state` labels the printed results further down.
state = 'more features'
df = df_more_features
y = df.loc[:, 'Profit']
X = df.drop(columns='Profit')


In [175]:
# Alternative experiment (disabled): predict 'Profit for single' from the other
# column of df_single. To switch, uncomment the lines below and run this cell
# instead of the cell that selects df_more_features.
# df = df_single
# X = df.drop('Profit for single', axis=1)
# y = df['Profit for single']
# state = 'single'

In [178]:
# Z-score outlier filter on the features.
# A row is kept only when every feature lies strictly within `threshold`
# standard deviations of that feature's mean.
threshold = 3
z_scores = np.abs(stats.zscore(X))

within_threshold = (z_scores < threshold).all(axis=1)
X_new = X[within_threshold]

# Keep the targets aligned with the surviving rows by index label.
y_new = y.loc[X_new.index]

In [176]:
# Optional diagnostic (disabled): boxplot of the outlier-filtered features in
# X_new. Needs seaborn and matplotlib; uncomment the whole cell to use it.
# import seaborn as sns
# import matplotlib.pyplot as plt

# plt.figure(figsize=(10, 6))
# sns.boxplot(data=X_new)
# plt.title('Boxplot of Features')
# plt.xlabel('Features')
# plt.ylabel('Values')
# plt.xticks(rotation=45)
# plt.show()


In [180]:
# Split the data into train, validation, and test sets
X_train, X_test, y_train, y_test = train_test_split(X_new, y_new, test_size=0.1, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.11, random_state=42)

In [181]:
#Perform feature scaling on the numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

In [183]:
print(state)

# Candidate regressors, all with default hyperparameters.
# The tree-based models are randomised, so they are seeded (with the same seed
# used for the data splits) to make the reported scores and the choice of
# best model repeatable across runs.
models = [
    LinearRegression(),
    DecisionTreeRegressor(random_state=42),
    RandomForestRegressor(random_state=42),
    KNeighborsRegressor()
]

# Validation R2 of each model, in the same order as `models`; used afterwards
# to pick the best model without predicting on the validation set again.
val_scores = []

# Train each model and report its R2 on the train, validation and test splits
for model in models:
    model.fit(X_train, y_train)

    print('\n....................\n')
    print(f"Model: {type(model).__name__}")
    print('....')
    # R2 on the training data (how well the model fits what it has seen)
    y_train_pred = model.predict(X_train)
    train_r2 = r2_score(y_train, y_train_pred)
    print("Train R2 score:", train_r2)

    print('....')
    # R2 on the validation data (the model-selection criterion)
    y_val_pred = model.predict(X_val)
    r2 = r2_score(y_val, y_val_pred)
    val_scores.append(r2)
    print(f"Validation R2 score: {r2}")
    print('....')
    # R2 on the test data, printed for information only; it is not used to
    # choose the best model
    y_test_pred = model.predict(X_test)
    test_r2 = r2_score(y_test, y_test_pred)
    print("Test R2 score:", test_r2)

# Select the best model: highest validation R2 (first one wins on ties)
best_model = models[int(np.argmax(val_scores))]

# Report the selected model's R2 on the test set
y_test_pred = best_model.predict(X_test)
r2 = r2_score(y_test, y_test_pred)
print('\n....................\n')
print(f"Best Model: {type(best_model).__name__}")
print(f"Test R2 score: {r2}")

more features

....................

Model: LinearRegression
....
Train R2 score: 0.2772476181555602
....
Validation R2 score: 0.21854204863316562
....
Test R2 score: 0.279506547616705

....................

Model: DecisionTreeRegressor
....
Train R2 score: 0.9999904944192479
....
Validation R2 score: 0.5116561874320382
....
Test R2 score: 0.3938722155826462

....................

Model: RandomForestRegressor
....
Train R2 score: 0.9525418902791861
....
Validation R2 score: 0.7007022998438195
....
Test R2 score: 0.6673990031740299

....................

Model: KNeighborsRegressor
....
Train R2 score: 0.7559577427629953
....
Validation R2 score: 0.6242853992977564
....
Test R2 score: 0.6484569122631563

....................

Best Model: RandomForestRegressor
Test R2 score: 0.6673990031740299


In [182]:
print(state)

# Random forest with hand-picked, regularised hyperparameters and a fixed seed.
model = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=5,
    max_features='log2',
    random_state=42,
)

# Optional 5-fold cross-validation on the training split (disabled):
# cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
# print("Cross-Validation Scores:", cv_scores)

# Fit once, then predict on all three splits.
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

# Score each split with both mean squared error and R2.
train_mse, train_r2 = mean_squared_error(y_train, y_train_pred), r2_score(y_train, y_train_pred)
val_mse, val_r2 = mean_squared_error(y_val, y_val_pred), r2_score(y_val, y_val_pred)
test_mse, test_r2 = mean_squared_error(y_test, y_test_pred), r2_score(y_test, y_test_pred)

# Report in train -> validation -> test order.
print("Train MSE:", train_mse)
print("Train R2 score:", train_r2)
print("Validation MSE:", val_mse)
print("Validation R2 score:", val_r2)
print("Test MSE:", test_mse)
print("Test R2 score:", test_r2)

more features
Train MSE: 2313.1693933515394
Train R2 score: 0.7623710148126185
Validation MSE: 3267.9028519668577
Validation R2 score: 0.7127751450037665
Test MSE: 2998.5038556385407
Test R2 score: 0.7023619101254084


## Here we're going to experiment with polynomial regression

In [None]:
##