In [None]:
import pandas as pd

In [None]:
# Load the BigMart sales CSV into the working frame `df`.
# NOTE(review): the absolute "/content/..." path looks like a Colab upload
# location — it must be adjusted when the notebook runs anywhere else.
df = pd.read_csv("/content/BigMart Sales Data.csv")

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

In [None]:
# Remove the item and outlet identifier columns from the working frame.
df = df.drop(columns=['Item_Identifier', 'Outlet_Identifier'])

In [None]:
# Column dtypes and non-null counts after the identifier columns were dropped.
df.info()

In [None]:
# List the column names that remain in the frame.
df.columns

## Handling missing values

In [None]:
# Count missing values per column before imputation.
df.isna().sum()

In [None]:
# Impute missing item weights with the column mean.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: that chained in-place form is
# deprecated in pandas 2.x and leaves `df` unchanged under Copy-on-Write.
df['Item_Weight'] = df['Item_Weight'].fillna(df['Item_Weight'].mean())

In [None]:
# Impute missing outlet sizes with the most frequent category
# (mode()[0] picks the first mode if several values tie).
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: that chained in-place form is
# deprecated in pandas 2.x and leaves `df` unchanged under Copy-on-Write.
df['Outlet_Size'] = df['Outlet_Size'].fillna(df['Outlet_Size'].mode()[0])

In [None]:
# Re-count missing values per column to confirm the imputation took effect.
df.isna().sum()

In [None]:
# Summary statistics of the frame after imputation.
df.describe()

## Convert categorical features into numerical features.

In [None]:
# Re-check the dtypes to see which columns still need to be encoded.
df.info()

In [None]:
# One-hot encode the nominal (unordered) categorical columns.
# drop_first=True leaves out the first level of every encoded column.
one_hot_encoded_data = pd.get_dummies(
    df,
    columns=[
        'Item_Fat_Content',
        'Item_Type',
        'Outlet_Location_Type',
        'Outlet_Type',
    ],
    drop_first=True,
)

In [None]:
# Inspect the frame after one-hot encoding.
one_hot_encoded_data.head()

In [None]:
# Label-encode Outlet_Size, which this notebook treats as an ordinal feature.
# NOTE(review): LabelEncoder numbers the labels in sorted order — confirm that
# order is an acceptable ranking for the Outlet_Size categories.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(one_hot_encoded_data['Outlet_Size'])
one_hot_encoded_data['Outlet_Size'] = le.transform(one_hot_encoded_data['Outlet_Size'])

In [None]:
# Inspect the frame after encoding Outlet_Size.
one_hot_encoded_data.head()

In [None]:
# Standardise the continuous features (zero mean, unit variance per column).
from sklearn.preprocessing import StandardScaler

numeric_cols = ['Item_Weight', 'Item_Visibility', 'Item_MRP']
scaler = StandardScaler()
# Fit once on all three columns. StandardScaler standardises each column
# independently, so the scaled values are the same as fitting per column, but
# `scaler` now keeps the statistics of every column rather than only the last
# one fitted, so it can be reused on new data or to invert the scaling.
one_hot_encoded_data[numeric_cols] = scaler.fit_transform(one_hot_encoded_data[numeric_cols])
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split performed later in the notebook, so the test rows
# contribute to the scaling statistics. Consider fitting on the training
# split only.

In [None]:
# Inspect the frame after scaling the numeric columns.
one_hot_encoded_data.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

In [None]:
# Target is the sales column; the features are every remaining column.
y = one_hot_encoded_data['Item_Outlet_Sales']
x = one_hot_encoded_data.drop(columns='Item_Outlet_Sales')

In [None]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
# `x_text` / `y_text` are misspellings of the test-set names; they are kept as
# aliases so cells that refer to the old names keep working.
x_text, y_text = x_test, y_test


In [None]:
# Two candidate regressors: a random forest (seeded so its results are
# repeatable) and a plain linear-regression baseline.
rf = RandomForestRegressor(random_state=42)
lr = LinearRegression()

In [None]:
# Fit both models on the training split.
lr.fit(x_train, y_train)
# fit() returns the estimator itself; as the cell's last expression, the
# random forest is what the notebook displays.
rf.fit(x_train, y_train)

## Prediction and Evaluation

In [None]:
# Predict sales for the held-out rows with each fitted model.
y_pred_rf = rf.predict(x_text)
y_pred_lr = lr.predict(x_text)

In [None]:
def evaluate_model(y_true, y_pred):
  """Compute standard regression metrics for a set of predictions.

  Args:
    y_true: Ground-truth target values.
    y_pred: Predicted values, aligned element-wise with ``y_true``.

  Returns:
    Tuple ``(mae, mse, rmse, r2)``: mean absolute error, mean squared error,
    root mean squared error (square root of ``mse``) and the R^2 score.
  """
  mae = mean_absolute_error(y_true, y_pred)
  # MSE must come from mean_squared_error so that RMSE below really is the
  # root of the mean squared error rather than the root of the MAE.
  mse = mean_squared_error(y_true, y_pred)
  rmse = np.sqrt(mse)
  # Score against the y_true argument, not a notebook global, so the function
  # is correct for whatever targets the caller passes in.
  r2 = r2_score(y_true, y_pred)
  return mae, mse, rmse, r2

# Linear regression evaluation on the held-out split
mae_lr, mse_lr, rmse_lr, r2_lr = evaluate_model(y_text, y_pred_lr)
print("Linear Regression:")
print(f"MAE: {mae_lr:.2f}, MSE: {mse_lr:.2f}, RMSE: {rmse_lr:.2f}, R2 Score: {r2_lr:.2f}")

# Random forest evaluation on the held-out split
mae_rf, mse_rf, rmse_rf, r2_rf = evaluate_model(y_text, y_pred_rf)
print("\nRandom Forest:")
# R2 is formatted with ':.2f' (two decimals) like every other metric; ':2f'
# would only set a minimum width of 2 and print the default six decimals.
print(f"MAE: {mae_rf:.2f}, MSE: {mse_rf:.2f}, RMSE: {rmse_rf:.2f}, R2 Score: {r2_rf:.2f}")

