In [19]:
# !pip install scikit-learn
# !pip install openpyxl
# !pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.12.4-py3-none-any.whl.metadata (8.3 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.12.4 imblearn-0.0


### Importing required libraries

In [141]:
# Required Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
# import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from sklearn.preprocessing import OneHotEncoder


### Constants and Functions

In [142]:
#####CONSTANTS####
FILE_PATH="file:///Users/samarthsingh/Downloads/"
FILE_NAME="complain_master_dataset.csv"
CONSUMER_CONSENT_PROVIDED_BINARY_ENCONDING={
    # we are categorizing nan as a separate category -1
    'Consent provided': 1, 
    'Consent not provided': 0,
    'Consent withdrawn': 0,  # You can change this to -1 if you wish to handle it separately
    'Other': -1  # Treat 'Other' as a distinct category
}
CONSUMER_DISPUTED_BINARY_ENCODING={
    # we are categorizing nan as a separate category -1
    'Yes': 1,  # Disputed
    'No': 0    # Not disputed
}
###COMMON FUNCTIONS####
def fixing_datatype(df):
    """
    :param df: This is the initial dataset we have read
    :return: Returns the dataset by enforcing the schema we have finalized
    """
    df['Company response to consumer'] = df['Company response to consumer'].astype(str)
    return df


### Loading Dataset and enforcing schema

In [143]:
# 1. Load dataset from Excel file
  # Replace with your file path
df = pd.read_csv(FILE_PATH+FILE_NAME)
df=fixing_datatype(df)

### Feature Engineering

In [144]:

# 2. Preprocessing

#BOOLEAN COLUMNS

df['Consumer consent provided?'] = df['Consumer consent provided?'].replace(CONSUMER_CONSENT_PROVIDED_BINARY_ENCONDING)
# Handle NaN values by filling them with -1 (for unknown consent status) for Consumer Consent Provided column 
df['Consumer consent provided?']=df['Consumer consent provided?'].fillna(-1)
#  Treat NaN as a separate category (-1) for Consumer Disputed column
df['Consumer disputed?'] = df['Consumer disputed?'].replace(CONSUMER_DISPUTED_BINARY_ENCODING)
df['Consumer disputed?']=df['Consumer disputed?'].fillna(-1)

#CATEGORICAL COLUMN

# Frequency encoding: count the occurrences of each company for company column
df['Company'] = df['Company'].map(df['Company'].value_counts())
# Frequency encoding for product column
df['Product'] = df['Product'].map(df['Product'].value_counts())
# Treat NaN as a separate category as 'Unknown' for Sub-Product column
df['Sub-product']=df['Sub-product'].fillna('Unknown')
# Frequency encoding for Sub-Product column
df['Sub-product'] = df['Sub-product'].map(df['Sub-product'].value_counts())
# Treat NaN as a separate category as 'Unknown' for Issue column
df['Issue']=df['Issue'].fillna('Unknown')
# Frequency encoding for Issue column
df['Issue'] = df['Issue'].map(df['Issue'].value_counts())
# Treat NaN as 'Unknown' for Sub-Issue column
df['Sub-issue']=df['Sub-issue'].fillna('Unknown')
# Frequency encoding for Sub-Issue column
df['Sub-issue'] = df['Sub-issue'].map(df['Sub-issue'].value_counts())

#one hot encoding for Submitted via column
encoder = OneHotEncoder()
# Apply one-hot encoding
encoded_data = encoder.fit_transform(df[['Submitted via']])
encoded_df = pd.DataFrame(encoded_data.toarray(), columns=encoder.get_feature_names_out(['Submitted via']))
# Concatenate the original DataFrame with the one-hot encoded columns
df = pd.concat([df, encoded_df], axis=1)


# Removing the nan from the target variable to prevent problems with undersampling 
df=df[~(df['Company response to consumer']=='nan')]



  df['Consumer consent provided?'] = df['Consumer consent provided?'].replace(CONSUMER_CONSENT_PROVIDED_BINARY_ENCONDING)
  df['Consumer disputed?'] = df['Consumer disputed?'].replace(CONSUMER_DISPUTED_BINARY_ENCODING)


### Splitting data into training and testing

In [145]:
# 3. Split the data into training and testing sets

X = df.drop(columns=['Company response to consumer']) # Features (all columns except the last)
y = df['Company response to consumer']    # Target (last column)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

### Undersampling to handle class imbalance

In [146]:
# Check the class distribution before undersampling
print("Class distribution before undersampling:", Counter(y_train))

# Initialize RandomUnderSampler
rus = RandomUnderSampler(random_state=42)

# Apply undersampling on the training data
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

# Check the class distribution after undersampling
print("Class distribution after undersampling:", Counter(y_train_resampled))


Class distribution before undersampling: Counter({'Closed with explanation': 465571, 'Closed with non-monetary relief': 213040, 'In progress': 45441, 'Closed with monetary relief': 19740, 'Closed': 2147, 'Closed without relief': 2109, 'Untimely response': 1293, 'Closed with relief': 656})
Class distribution after undersampling: Counter({'Closed': 656, 'Closed with explanation': 656, 'Closed with monetary relief': 656, 'Closed with non-monetary relief': 656, 'Closed with relief': 656, 'Closed without relief': 656, 'In progress': 656, 'Untimely response': 656})


### Linear Regression Model

In [147]:
# 4. Train Linear Regression Model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Predictions for Linear Regression
y_pred_linear_train = linear_model.predict(X_train)
y_pred_linear_test = linear_model.predict(X_test)

# Calculate the metrics for Linear Regression
mse_linear_train = mean_squared_error(y_train, y_pred_linear_train)
mse_linear_test = mean_squared_error(y_test, y_pred_linear_test)
r2_linear_train = r2_score(y_train, y_pred_linear_train)
r2_linear_test = r2_score(y_test, y_pred_linear_test)

print("Linear Regression:")
print(f"Train MSE: {mse_linear_train}, Test MSE: {mse_linear_test}")
print(f"Train R²: {r2_linear_train}, Test R²: {r2_linear_test}")

ValueError: could not convert string to float: '2024-05-14'

### Random forest model

In [148]:
# 5. Train Random Forest Regressor Model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predictions for Random Forest Regressor
y_pred_rf_train = rf_model.predict(X_train)
y_pred_rf_test = rf_model.predict(X_test)

# Calculate the metrics for Random Forest Regressor
mse_rf_train = mean_squared_error(y_train, y_pred_rf_train)
mse_rf_test = mean_squared_error(y_test, y_pred_rf_test)
r2_rf_train = r2_score(y_train, y_pred_rf_train)
r2_rf_test = r2_score(y_test, y_pred_rf_test)

print("\nRandom Forest Regressor:")
print(f"Train MSE: {mse_rf_train}, Test MSE: {mse_rf_test}")
print(f"Train R²: {r2_rf_train}, Test R²: {r2_rf_test}")

# 6. Plot feature importance for Random Forest
feature_importances = rf_model.feature_importances_
plt.figure(figsize=(10, 6))
plt.barh(X.columns, feature_importances)
plt.xlabel('Feature Importance')
plt.ylabel('Features')
plt.title('Feature Importance from Random Forest')
plt.show()

ValueError: could not convert string to float: '2024-05-14'