In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# Set plot style: apply seaborn's 'whitegrid' theme to the figures drawn below
sns.set_style('whitegrid')
# IPython magic (notebook-only, not plain Python): render matplotlib figures inline
%matplotlib inline

In [None]:
# Load the dataset; the relative path is resolved against the kernel's
# current working directory.
df = pd.read_csv('global_superstore_clean.csv')

# Columns carried forward into the exploratory plots and the model
numeric_columns = ['Sales', 'Quantity', 'Discount', 'Profit', 'Shipping_Cost']
numerical_df = df.loc[:, numeric_columns]

# Per-column summary statistics as produced by pandas' describe()
summary_stats = numerical_df.describe()
print(summary_stats)

In [None]:
# Scatter-plot matrix: one panel per pair of selected columns
sns.pairplot(numerical_df)
# y > 1 lifts the title above the top row of panels
plt.suptitle('Pairplot of Key Numerical Variables', y=1.02)
plt.show()

# Heatmap of pairwise correlations, each cell annotated to two decimals
corr_matrix = numerical_df.corr()
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix of Numerical Features')
plt.show()
# Author's note from an earlier run (verify against the rendered heatmap):
# Sales and Profit are positively correlated; Discount and Profit are
# negatively correlated.

In [None]:
# Modelling setup: 'Profit' is the response, regressed on three predictors.
target = 'Profit'
features = ['Sales', 'Discount', 'Shipping_Cost']

# Predictor matrix and response vector taken from the full frame
X, y = df[features], df[target]

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# shuffle (and therefore the split) the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Linear regression fitted on the training split only; the test split stays
# unseen until evaluation.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# Score the fitted model on the held-out test set
y_pred = model.predict(X_test)

# R² plus squared-error metrics; RMSE is the square root of MSE, so it is
# expressed in the same units as the target.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Model R-squared (R²): {r2:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

# The interpretation is derived from the computed score instead of being
# hard-coded, so it cannot drift from the numbers printed above when the data
# or the split changes. R² can be negative on held-out data, which means the
# model does worse than always predicting the mean.
if r2 >= 0:
    print(f"Interpretation: the model explains about {r2:.1%} of the variance in test-set Profit.")
else:
    print("Interpretation: R² is negative, so the model predicts test-set Profit "
          "worse than always predicting the mean.")

In [None]:
# One fitted coefficient per feature, indexed by feature name
coefficients = pd.DataFrame(model.coef_, index=features, columns=['Coefficient'])
print(coefficients)

# (feature name, wording of the unit change) for each sentence printed below.
# Coefficients are looked up by name, so reordering `features` cannot silently
# attach a coefficient to the wrong sentence.
unit_changes = [
    ('Sales', '$1 increase in Sales'),
    ('Discount', '1.0 (100%) increase in Discount'),
    ('Shipping_Cost', '$1 increase in Shipping Cost'),
]

print("\nInterpretation:")
for feature_name, change_text in unit_changes:
    coef = coefficients.loc[feature_name, 'Coefficient']
    # Take the direction from the fitted sign and print the magnitude, so the
    # sentence never reads "decrease by $-12.34" or claims a decrease for a
    # positive coefficient.
    direction = "increase" if coef >= 0 else "decrease"
    print(f"- For every {change_text}, Profit is predicted to {direction} by ${abs(coef):.2f}, holding other factors constant.")

# NOTE(review): read the signs from the output above before drawing business
# conclusions (e.g. about shipping costs eroding profit). If Sales and
# Shipping_Cost are strongly correlated (check the correlation heatmap), their
# individual coefficients may be unstable and should be interpreted with care.

In [None]:
# Actual vs. predicted scatter for the test set
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.5)

# Dashed red y = x reference line across the observed range of actual values;
# points on this line are predicted exactly.
actual_min, actual_max = y_test.min(), y_test.max()
ax.plot([actual_min, actual_max], [actual_min, actual_max], '--r', linewidth=2)

ax.set_xlabel('Actual Profit')
ax.set_ylabel('Predicted Profit')
ax.set_title('Actual vs. Predicted Profit')
plt.show()