# API Project Notebook

In [1]:
import pandas as pd

# Read the sales table from a CSV given by a relative path (resolved against
# the kernel's working directory).
file_path = "sales_data.csv"
sales_data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows as a quick check that the load worked.
sales_data.head(5)


Unnamed: 0,Brands,Segmentations,Models,USA,Region,ADI,Drive Type,Fuel Type,Model,Model Year,...,2023 - Oct,2023 - Nov,2023 - Dec,2024 - Jan,2024 - Feb,2024 - Mar,2024 - Apr,2024 - May,2024 - Jun,Total
0,INFINITI,Near Luxury Car,Q60,USA,Mid-Atlantic,Baltimore,AWD,Gasoline,Q60,2022,...,2.0,1.0,,,,,,,,4
1,INFINITI,Near Luxury Car,Q50,USA,Mid-Atlantic,Baltimore,AWD,Gasoline,Q50,2024,...,,,,,2.0,,3.0,9.0,3.0,17
2,INFINITI,Near Luxury Car,Q50,USA,Mid-Atlantic,Baltimore,AWD,Gasoline,Q50,2023,...,9.0,6.0,6.0,5.0,2.0,4.0,2.0,,1.0,54
3,INFINITI,Near Luxury SUV,QX50,USA,Mid-Atlantic,Baltimore,AWD,Gasoline,QX50,2024,...,,,2.0,5.0,7.0,6.0,8.0,1.0,5.0,34
4,INFINITI,Near Luxury SUV,QX50,USA,Mid-Atlantic,Baltimore,FWD,Gasoline,QX50,2023,...,1.0,,,,,,,,,1


### Data Cleaning & Imputation Work

In [2]:
# Per-column tallies: how many cells are NaN, and how many compare equal to 0.
# The equality test is applied to every column, not only the numeric ones.
missing_values = sales_data.isna().sum()
zero_values = sales_data.eq(0).sum()

# Keep only columns with at least one NaN or at least one zero. Where a column
# appears on one side only, the other side's count is shown as '-'.
has_missing = missing_values > 0
has_zero = zero_values > 0
issues = (
    pd.DataFrame({
        'Missing Values': missing_values[has_missing],
        'Zero Values': zero_values[has_zero],
    })
    .fillna('-')
    .reset_index()
)

# Give the former index (the source column name) a readable header.
issues.columns = ['Column', 'Missing Values', 'Zero Values']
issues


Unnamed: 0,Column,Missing Values,Zero Values
0,2023 - Aug,119823.0,1903.0
1,2023 - Dec,112923.0,2018.0
2,2023 - Jul,122797.0,1923.0
3,2023 - Nov,117375.0,1907.0
4,2023 - Oct,117195.0,1880.0
5,2023 - Sep,117706.0,1929.0
6,2024 - Apr,121594.0,1467.0
7,2024 - Feb,121423.0,1823.0
8,2024 - Jan,122054.0,2198.0
9,2024 - Jun,121039.0,1932.0


##### Reshaping the Data

In [5]:
# Reshape the wide table into long format: one row per original row per month
# column, with the month label in 'Date' and the count in 'Sales Volume'.
#
# The drop + melt step is guarded because this cell rebinds `sales_data` to the
# long frame. On a re-run the frame already has a 'Sales Volume' column, and
# melting it a second time would not reproduce this result.
if 'Sales Volume' not in sales_data.columns:
    # 'USA' and 'Total' are removed first so they are not melted into the
    # month rows; errors='ignore' tolerates either column being absent.
    sales_data = sales_data.drop(columns=['USA', 'Total'], errors='ignore').melt(
        id_vars=['Brands', 'Segmentations', 'Models', 'Region', 'ADI',
                 'Drive Type', 'Fuel Type', 'Model', 'Model Year'],
        var_name='Date',
        value_name='Sales Volume'
    )

# Clean up the reshaped data (both steps are safe to repeat).
# NOTE(review): a blank month is recorded as 0 sales here -- confirm that a
# missing cell really means "no sales" rather than "not reported".
sales_data['Sales Volume'] = sales_data['Sales Volume'].fillna(0)
sales_data['Date'] = sales_data['Date'].str.strip()

# Show the long-format frame (pandas truncates the display of a large frame).
sales_data


Unnamed: 0,Brands,Segmentations,Models,Region,ADI,Drive Type,Fuel Type,Model,Model Year,Date,Sales Volume
0,INFINITI,Near Luxury Car,Q60,Mid-Atlantic,Baltimore,AWD,Gasoline,Q60,2022,2023 - Jul,0.0
1,INFINITI,Near Luxury Car,Q50,Mid-Atlantic,Baltimore,AWD,Gasoline,Q50,2024,2023 - Jul,0.0
2,INFINITI,Near Luxury Car,Q50,Mid-Atlantic,Baltimore,AWD,Gasoline,Q50,2023,2023 - Jul,0.0
3,INFINITI,Near Luxury SUV,QX50,Mid-Atlantic,Baltimore,AWD,Gasoline,QX50,2024,2023 - Jul,0.0
4,INFINITI,Near Luxury SUV,QX50,Mid-Atlantic,Baltimore,FWD,Gasoline,QX50,2023,2023 - Jul,0.0
...,...,...,...,...,...,...,...,...,...,...,...
2474935,VinFast,Mainstream EV SUV,VF 8,Southern,Charleston,,Electric,VF8,2023,2024 - Jun,0.0
2474936,VinFast,Mainstream EV SUV,VF 8,Mountain States,Sioux Falls,,Electric,VF8,2023,2024 - Jun,0.0
2474937,VinFast,Mainstream EV SUV,VF 8,Central,Detroit,,Electric,VF8,2023,2024 - Jun,2.0
2474938,VinFast,Mainstream EV SUV,VF 8,Central,Lexington,,Electric,VF8,2023,2024 - Jun,0.0


#### Regression Analysis


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import statsmodels.api as sm

# Linear regression of 'Sales Volume' on region, drive type, fuel type,
# segment and model year, fitted twice: with scikit-learn (hold-out R^2 / MSE)
# and with statsmodels (coefficient summary on the training split).

# Select relevant columns for analysis
data = sales_data[['Sales Volume', 'Region', 'Drive Type', 'Fuel Type', 'Model Year', 'Segmentations']]

# Drop every row that has a missing value in any selected column.
# NOTE(review): the sample output above shows rows with no 'Drive Type';
# those rows are excluded from the model by this step -- confirm that is intended.
data = data.dropna()

# One-hot encode categorical variables, dropping the first level of each to
# avoid perfect collinearity with the intercept.
# dtype=float matters: pandas >= 2.0 emits bool dummy columns by default, and a
# frame mixing bool dummies with the integer 'Model Year' converts to an object
# array, which statsmodels OLS rejects.
data = pd.get_dummies(
    data,
    columns=['Region', 'Drive Type', 'Fuel Type', 'Segmentations'],
    drop_first=True,
    dtype=float,
)

# Split the data into features (X) and target (y)
X = data.drop(columns=['Sales Volume'])
y = data['Sales Volume']

# Split into training and testing sets (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the Linear Regression model using sklearn
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the sklearn model on the held-out 20%
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"R-squared (sklearn): {r2}")
print(f"Mean Squared Error (sklearn): {mse}")

# statsmodels does not add an intercept on its own, so add a constant column
X_train_sm = sm.add_constant(X_train)

# Fit the statsmodels regression on the same training split
model_sm = sm.OLS(y_train, X_train_sm).fit()

# Display the statsmodels summary
print(model_sm.summary())
