In [9]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

# Load the dataset
data = pd.read_csv('sales_data.csv')

# Display the first few rows of the dataset
data.head()


Unnamed: 0,quantity_sold,quantity_in_stock,feature1,feature2,target_variable
0,82,15,card,surface,2.501076
1,36,32,fall,itself,22.321074
2,95,14,laugh,debate,67.669949
3,70,12,personal,economy,59.049251
4,5,4,yard,however,9.369524


In [10]:
# One-hot encode categorical variables like 'feature1' and 'feature2'
data = pd.get_dummies(data, columns=['feature1', 'feature2'], prefix=['feat1', 'feat2'])

# Display the modified dataset after one-hot encoding
data.head()


Unnamed: 0,quantity_sold,quantity_in_stock,target_variable,feat1_American,feat1_Congress,feat1_Democrat,feat1_Mr,feat1_Republican,feat1_TV,feat1_ability,...,feat2_worry,feat2_would,feat2_write,feat2_writer,feat2_yard,feat2_yeah,feat2_you,feat2_young,feat2_your,feat2_yourself
0,82,15,2.501076,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,36,32,22.321074,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,95,14,67.669949,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,70,12,59.049251,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,5,4,9.369524,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [11]:
# Separate features and target variable
X = data.drop('target_variable', axis=1)
y = data['target_variable']

# Display the features (X) and target variable (y)
X.head()
y.head()


0     2.501076
1    22.321074
2    67.669949
3    59.049251
4     9.369524
Name: target_variable, dtype: float64

In [12]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of training and testing sets
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((800, 1247), (200, 1247), (800,), (200,))

In [13]:
# Standardize numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Display the scaled features
X_train_scaled[:5, :]


array([[-0.85605689,  1.29540779, -0.03537746, ...,  0.        ,
         0.        , -0.05006262],
       [-1.20225319, -0.64217738, -0.03537746, ...,  0.        ,
         0.        , -0.05006262],
       [-1.27149245,  0.16808551, -0.03537746, ...,  0.        ,
         0.        , -0.05006262],
       [ 1.29036015,  0.20331433, -0.03537746, ...,  0.        ,
         0.        , -0.05006262],
       [-1.47921023,  0.59083137, -0.03537746, ...,  0.        ,
         0.        , -0.05006262]])

In [16]:
# Initialize the model (Random Forest Regressor)
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train_scaled, y_train)


In [15]:
# Make predictions on the test set
predictions = model.predict(X_test_scaled)

# Evaluate the model
mae = mean_absolute_error(y_test, predictions)
print(f'Mean Absolute Error: {mae}')


Mean Absolute Error: 24.882548332665728
