## Assignment 1


In [10]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from statsmodels.formula.api import ols
from scipy.stats import norm, probplot
from sklearn.preprocessing import StandardScaler


db_dir = os.getcwd()
df = pd.read_csv(db_dir + r'/data/train.csv')

In [25]:
# Calculate Z-scores for each numeric column
z_scores = np.abs(stats.zscore(df.select_dtypes(include=[np.number])))

# Identify indices of rows that have any Z-score greater than 3
outlier_indices = np.where(z_scores > 3)[0]

# Remove rows with outliers
df_no_outliers = df.drop(index=outlier_indices)

# Verify the shape of the dataset before and after removing outliers
print("Original dataset shape:", df.shape)


# Calculate the percentage of missing values for each column
missing_percentage = df_no_outliers.isnull().mean() * 100

# Drop columns where more than 20% of the data is missing
columns_to_drop = missing_percentage[missing_percentage > 20].index
df_no_outliers.drop(columns=columns_to_drop, inplace=True)

print("Dataset shape after removing outliers:", df_no_outliers.shape)
# Optionally, save the cleaned dataset
# df_no_outliers.to_csv('/mnt/data/cleaned_data_no_outliers.csv', index=False)

Original dataset shape: (1460, 81)
Dataset shape after removing outliers: (1037, 75)


In [26]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

# Assuming df_no_outliers is your cleaned dataset
# SalePrice is the target variable and the rest are features
X = df_no_outliers.drop(columns=['SalePrice'])  # Features
y = df_no_outliers['SalePrice']  # Target variable

# Handle categorical variables (convert them to numeric if needed)
X = pd.get_dummies(X, drop_first=True)

# Check for any NaN or infinite values and remove them
X.replace([np.inf, -np.inf], np.nan, inplace=True)
y.replace([np.inf, -np.inf], np.nan, inplace=True)

# Combine X and y to ensure dropping rows happens consistently
df_combined = pd.concat([X, y], axis=1)

# Drop rows with any NaN or infinite values
df_combined.dropna(inplace=True)

# Separate X and y again after cleaning
X = df_combined.drop(columns=['SalePrice'])
y = df_combined['SalePrice']

# Ensure all data is numeric
X = X.apply(pd.to_numeric)
y = pd.to_numeric(y)

# Debugging: Check for any remaining object types
print("Data types in X after conversion:")
print(X.dtypes)
print("Data types in y after conversion:")
print(y.dtype)

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=0)

# Add a constant term for the intercept
X_train = sm.add_constant(X_train)
X_test = sm.add_constant(X_test)

# Convert DataFrame to numpy array to ensure no dtype issues
X_train = np.asarray(X_train, dtype=np.float64)
y_train = np.asarray(y_train, dtype=np.float64)

# Fit the OLS model
model = sm.OLS(y_train, X_train)
results = model.fit()

# Print the OLS model summary
print("OLS Model Summary:")
print(results.summary())

# To handle multicollinearity - Calculating VIF
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns  # Exclude the constant for VIF
vif_data["VIF"] = [variance_inflation_factor(X_train[:, 1:], i) for i in range(1, X_train.shape[1])]
print("\nVIF Data:")
print(vif_data)

# Predicting on the training data
predicted_y = results.predict(X_train)
residuals = y_train - predicted_y

# Plotting the residuals
fig, ax = plt.subplots(1, 3, figsize=(18, 5))

# Histogram of residuals
sns.histplot(residuals, kde=True, ax=ax[0])
ax[0].set_title('Histogram of Residuals')
ax[0].set_xlabel('Residuals')
ax[0].set_ylabel('Frequency')

# Q-Q plot of residuals
stats.probplot(residuals, dist="norm", plot=ax[1])
ax[1].set_title('Q-Q Plot of Residuals')

# Residuals vs. fitted values
ax[2].scatter(results.fittedvalues, residuals)
ax[2].axhline(0, color='red', linestyle='dashed', linewidth=2)
ax[2].set_title('Residuals vs. Fitted Values')
ax[2].set_xlabel('Fitted Values')
ax[2].set_ylabel('Residuals')

plt.tight_layout()
plt.show()

Data types in X after conversion:
Id                         int64
MSSubClass                 int64
LotFrontage              float64
LotArea                    int64
OverallQual                int64
                          ...   
SaleCondition_AdjLand       bool
SaleCondition_Alloca        bool
SaleCondition_Family        bool
SaleCondition_Normal        bool
SaleCondition_Partial       bool
Length: 207, dtype: object
Data types in y after conversion:
int64
OLS Model Summary:
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.960
Model:                            OLS   Adj. R-squared:                  0.942
Method:                 Least Squares   F-statistic:                     54.94
Date:                Mon, 12 Aug 2024   Prob (F-statistic):          2.04e-219
Time:                        07:30:28   Log-Likelihood:                -6689.9
No. Observations:                 610   AIC

  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss


IndexError: index 206 is out of bounds for axis 1 with size 206

In [30]:
import pandas as pd

# Create the data for the process steps and their capacities
data = {
    "Process Step": [
        "Truck Arrival", 
        "Unload into Dumper Bins", 
        "Washing", 
        "Bagging Machine 1", 
        "Bagging Machine 2", 
        "Total Bagging"
    ],
    "Capacity": [
        "6 trucks/hour", 
        "8 truckloads", 
        "5 truckloads/hour", 
        "2 truckloads/hour", 
        "2 truckloads/hour", 
        "4 truckloads/hour"
    ]
}

# Convert the data into a pandas DataFrame
df = pd.DataFrame(data)

# Save the DataFrame as an Excel file
file_path = "/Cranberry_Process_Flow_Updated.xlsx"
df.to_excel("Cranberry_Process_Flow_Updated.xlsx", index=False)

file_path

'/Cranberry_Process_Flow_Updated.xlsx'