<a href="https://colab.research.google.com/github/pratik7191/random-forest-streamlit-app/blob/main/Final_Code_Book_Application_Development.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import pandas as pd
from csv import QUOTE_MINIMAL

# Load the dataset. A read failure is reported and then re-raised: every
# later cell needs `data`, so continuing would only surface as a confusing
# NameError further down.
data_path = "/content/Capstone Trade-in Dataset 121124 (1).csv"
try:
    # on_bad_lines='skip' drops malformed rows without reporting them.
    data = pd.read_csv(data_path, quoting=QUOTE_MINIMAL, on_bad_lines='skip')
except Exception as e:
    print("Error reading the file:", e)
    raise

# Print sample and column details (outside the try so that only read
# errors are labelled as such).
print("Dataset Columns:", data.columns)
print("Sample Rows:", data.head())

Dataset Columns: Index(['tradein_ref_id', 'trade_in_status', 'trade_in_datetime',
       'diagnosis_done', 'offer_program_name', 'IMEI', 'IMEI_upgraded_device',
       'trade_in_price_offered_to_the_customer',
       'trade_in_price_offered_to_the_retailer', 'device_grade',
       'liquidator_type', 'requote_value', 'offer_value', 'new_device_brand',
       'liq_p_flag', 'partner_id', 'product_id', 'psl_id',
       'old_device_brand_id', 'dt_of_purchase', 'is_under_warranty',
       'warranty_till_dt', 'name_of_firm', 'service_location_name', 'zip_code',
       'brand_name', 'product_name'],
      dtype='object')
Sample Rows:   tradein_ref_id trade_in_status trade_in_datetime  diagnosis_done  \
0   GKAL1ZXVZHDO          Failed           21:34.0               1   
1   IGG4VYQOYU6J          Failed           31:12.0               1   
2   8VUMFNU7WH1P          Failed           51:41.0               1   
3   ARYFHWQEDROQ          Failed           54:46.0               1   
4   MMSBWLHGR8B5

In [None]:
# Display (rows, columns) of the loaded frame.
# NOTE(review): the recorded output shows 1,048,575 rows, which equals the
# Excel worksheet row limit (1,048,576) minus one header row - TODO confirm
# the CSV was not truncated by a spreadsheet export.
data.shape

(1048575, 27)

In [None]:
# Parse the trade-in timestamp when the column exists; values that cannot
# be parsed become NaT because of errors='coerce'.
# NOTE(review): the sample output shows values such as "21:34.0", which do
# not look like full timestamps - confirm they parse as intended.
if 'trade_in_datetime' in data.columns:
    parsed_trade_in_time = pd.to_datetime(data['trade_in_datetime'], errors='coerce')
    data['trade_in_datetime'] = parsed_trade_in_time

# Force both offered-price columns to numeric dtype; non-numeric entries
# become NaN.
price_columns = (
    'trade_in_price_offered_to_the_customer',
    'trade_in_price_offered_to_the_retailer',
)
for col in price_columns:
    numeric_prices = pd.to_numeric(data[col], errors='coerce')
    data[col] = numeric_prices

# Integer-encode device_grade into a new column. Category codes follow the
# category order and missing values receive the code -1.
if 'device_grade' in data.columns:
    grade_categories = data['device_grade'].astype('category')
    data['device_grade_encoded'] = grade_categories.cat.codes

In [None]:
# Columns the notebook treats as irrelevant for modelling.
columns_to_drop = ['tradein_ref_id', 'IMEI', 'name_of_firm', 'service_location_name', 'zip_code', 'product_name']

# errors='ignore' removes whichever of these are present and silently skips
# the ones that are not.
data = data.drop(columns=columns_to_drop, errors='ignore')

# Show what is left.
print("Remaining Columns:", data.columns)

Remaining Columns: Index(['trade_in_status', 'trade_in_datetime', 'diagnosis_done',
       'offer_program_name', 'IMEI_upgraded_device',
       'trade_in_price_offered_to_the_customer',
       'trade_in_price_offered_to_the_retailer', 'device_grade',
       'liquidator_type', 'requote_value', 'offer_value', 'new_device_brand',
       'liq_p_flag', 'partner_id', 'product_id', 'psl_id',
       'old_device_brand_id', 'dt_of_purchase', 'is_under_warranty',
       'warranty_till_dt', 'brand_name', 'device_grade_encoded'],
      dtype='object')


In [None]:
# Drop columns with more than 80% missing values.
# This has to happen BEFORE imputation: once the gaps are filled, the
# missing-value ratio of every numeric/object column is 0 and the filter
# could never remove anything.
threshold = 0.8
# .copy() gives an independent frame so the column assignments below do not
# write into a slice of the previous frame.
data = data.loc[:, data.isnull().mean() < threshold].copy()

# Fill missing values in numerical columns with mean.
# Assign the result back instead of calling fillna(inplace=True) on
# data[col]: that chained in-place form warns on pandas 2.x and does not
# modify the frame when Copy-on-Write is enabled.
for col in data.select_dtypes(include=['float64', 'int64']).columns:
    data[col] = data[col].fillna(data[col].mean())

# Fill missing values in categorical columns with mode
for col in data.select_dtypes(include=['object']).columns:
    modes = data[col].mode()
    # mode() is empty when the column holds no non-null value; there is
    # nothing to impute with in that case, so leave the column untouched.
    if not modes.empty:
        data[col] = data[col].fillna(modes.iloc[0])

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

# Specify the target column
target_column = 'trade_in_status'

# One seed shared by the split and by the stochastic estimators so that
# Restart & Run All reproduces the same metrics.
random_seed = 42

# Encode the target as integer category codes and keep the code -> label
# mapping so predictions can be translated back to status names.
# Run this on freshly loaded data: once the column already holds codes, the
# mapping printed here is just code -> code.
status_categories = data[target_column].astype('category')
status_labels = dict(enumerate(status_categories.cat.categories))
data[target_column] = status_categories.cat.codes
print("Target encoding (code -> label):", status_labels)

# Separate features and target
X = data.drop(columns=[target_column])
y = data[target_column]

# Convert datetime columns to numeric features
if 'trade_in_datetime' in X.columns:
    X['trade_in_datetime'] = pd.to_datetime(X['trade_in_datetime'], errors='coerce')
    X['trade_in_year'] = X['trade_in_datetime'].dt.year
    X['trade_in_month'] = X['trade_in_datetime'].dt.month
    X['trade_in_day'] = X['trade_in_datetime'].dt.day
    X = X.drop(columns=['trade_in_datetime'])

# Encode categorical columns as integer category codes (missing -> -1)
categorical_columns = X.select_dtypes(include=['object']).columns
for col in categorical_columns:
    X[col] = X[col].astype('category').cat.codes

# Fill missing values with 0
X = X.fillna(0)

# Verify that X is fully numeric
print("Data types after preprocessing:\n", X.dtypes)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_seed
)


def _fit_and_report(model, label):
    """Fit `model` on the training split, print its test metrics, return test predictions."""
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(f"{label} Accuracy:", accuracy_score(y_test, predictions))
    print("Classification Report:\n", classification_report(y_test, predictions))
    return predictions


# Logistic Regression
logreg = LogisticRegression(max_iter=1000)
y_pred_logreg = _fit_and_report(logreg, "Logistic Regression")

# Random Forest (seeded: bootstrap and feature sampling are random)
rf = RandomForestClassifier(random_state=random_seed)
y_pred_rf = _fit_and_report(rf, "Random Forest")

# Gradient Boosting (seeded for the same reason)
gb = GradientBoostingClassifier(random_state=random_seed)
y_pred_gb = _fit_and_report(gb, "Gradient Boosting")


Data types after preprocessing:
 diagnosis_done                              int64
offer_program_name                           int8
IMEI_upgraded_device                        int16
trade_in_price_offered_to_the_customer    float64
trade_in_price_offered_to_the_retailer    float64
device_grade                                 int8
liquidator_type                              int8
requote_value                             float64
offer_value                               float64
new_device_brand                             int8
liq_p_flag                                  int64
partner_id                                  int64
product_id                                  int64
psl_id                                      int64
old_device_brand_id                         int64
dt_of_purchase                              int16
is_under_warranty                         float64
warranty_till_dt                            int16
brand_name                                   int8
device_grade_enco

In [None]:
# Training accuracy: score each fitted model on the data it was trained on.
logreg_train_accuracy, rf_train_accuracy, gb_train_accuracy = (
    accuracy_score(y_train, fitted_model.predict(X_train))
    for fitted_model in (logreg, rf, gb)
)

# Testing accuracy: reuse the test-set predictions computed when the models
# were trained.
logreg_test_accuracy, rf_test_accuracy, gb_test_accuracy = (
    accuracy_score(y_test, test_predictions)
    for test_predictions in (y_pred_logreg, y_pred_rf, y_pred_gb)
)

# Collect the scores column by column, one entry per model.
results_summary = dict([
    ("Model", ["Logistic Regression", "Random Forest", "Gradient Boosting"]),
    ("Train Accuracy", [logreg_train_accuracy, rf_train_accuracy, gb_train_accuracy]),
    ("Test Accuracy", [logreg_test_accuracy, rf_test_accuracy, gb_test_accuracy]),
])
results_df = pd.DataFrame(results_summary)

# Show the comparison table.
print("Model Training and Testing Accuracy:")
print(results_df)

Model Training and Testing Accuracy:
                 Model  Train Accuracy  Test Accuracy
0  Logistic Regression        0.950114       0.950628
1        Random Forest        0.995320       0.982519
2    Gradient Boosting        0.982232       0.982119


In [None]:
import pickle

# Destination file for the serialized Random Forest estimator.
pickle_file_path = 'random_forest_model.pkl'

# Serialize the fitted `rf` object to that file (binary mode).
with open(pickle_file_path, mode='wb') as model_file:
    pickle.dump(rf, model_file)

print(f"Random Forest model saved to: {pickle_file_path}")


Random Forest model saved to: random_forest_model.pkl


In [None]:
# Install Streamlit and ngrok for Google Colab
!pip install streamlit ngrok


Collecting streamlit
  Downloading streamlit-1.41.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting ngrok
  Downloading ngrok-1.4.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.41.1-py2.py3-none-any.whl (9.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ngrok-1.4.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none

In [None]:
%%writefile app.py

import streamlit as st
import pickle
import numpy as np

# Load the trained Random Forest model
with open('random_forest_model.pkl', 'rb') as file:
    model = pickle.load(file)

# Title of the web app
st.title("Trade-In Status Prediction App")

# Input fields for the top 8 features
st.header("Input the Device Details")

# Input fields based on the top 8 features
offer_value = st.number_input("Offer Value", min_value=0.0, step=0.01)
is_under_warranty = st.number_input("Is Under Warranty (0 or 1)", min_value=0, max_value=1, step=1)
device_grade = st.number_input("Device Grade (e.g., 1, 2, 3)", min_value=0, step=1)
new_device_brand = st.number_input("New Device Brand (e.g., 1, 2, 3)", min_value=0, step=1)

# Predict button
if st.button("Predict Trade-In Status"):
    # Create feature array for prediction
    features = np.array([
        offer_value,
        is_under_warranty,
        device_grade,
        new_device_brand
    ]).reshape(1, -1)

    # Predict using the model
    prediction = model.predict(features)

    # Map numerical predictions to status labels
    status_mapping = {0: "Failed", 1: "Interim", 2: "Success"}  # Modify as per your dataset
    st.success(f"Predicted Trade-In Status: {status_mapping.get(prediction[0], 'Unknown')}")


Overwriting app.py


In [None]:
#INFO: Running on local URL: http://localhost:8501
#INFO: Exposing the app to the internet at: https://<your-ngrok-url>.ngrok.io


SyntaxError: invalid syntax (<ipython-input-55-d44911f69250>, line 1)

In [None]:
# Install Streamlit together with pyngrok, the package the following cells
# import (`from pyngrok import ngrok`) to open the tunnel.
!pip install streamlit pyngrok



In [None]:
import os
from getpass import getpass

from pyngrok import ngrok

# Never commit the authtoken to the notebook. Read it from the environment,
# or prompt for it without echoing. A token that was previously committed
# here should be revoked in the ngrok dashboard.
ngrok_auth_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
if not ngrok_auth_token:
    raise ValueError("An ngrok authtoken is required to open the tunnel.")

# Pass the bare token: no surrounding <> placeholder brackets.
ngrok.set_auth_token(ngrok_auth_token.strip())


In [None]:
import subprocess
import sys

from pyngrok import ngrok

# 8501 is the default port for Streamlit. It is pinned explicitly so the
# server and the tunnel always agree, even if this cell is run again.
STREAMLIT_PORT = 8501

# Start the Streamlit app as a background child process. Popen returns
# immediately and does not depend on shell job control (`&`). Server output
# goes to streamlit.log so start-up errors can be inspected.
with open("streamlit.log", "w") as streamlit_log:
    streamlit_process = subprocess.Popen(
        [
            sys.executable, "-m", "streamlit", "run", "app.py",
            "--server.port", str(STREAMLIT_PORT),
            "--server.headless", "true",
        ],
        stdout=streamlit_log,
        stderr=subprocess.STDOUT,
    )

# Use ngrok to expose the app
public_url = ngrok.connect(STREAMLIT_PORT)

# Recent pyngrok returns a tunnel object whose .public_url holds the address;
# fall back to the returned value itself if it has no such attribute.
print(f"Streamlit app is running at: {getattr(public_url, 'public_url', public_url)}")


In [None]:
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score


def _summarize(y_true, y_pred):
    """Return accuracy and support-weighted precision/recall as plain floats."""
    return {
        'accuracy': float(accuracy_score(y_true, y_pred)),
        'precision': float(precision_score(y_true, y_pred, average='weighted', zero_division=0)),
        'recall': float(recall_score(y_true, y_pred, average='weighted', zero_division=0)),
    }


# Model results computed from the test-set predictions made earlier in the
# notebook. Hard-coded numbers go stale as soon as the models are retrained.
lr_model_results = _summarize(y_test, y_pred_logreg)
rf_model_results = _summarize(y_test, y_pred_rf)
gb_model_results = _summarize(y_test, y_pred_gb)


# Store results for comparison
model_results = [
    {'model_name': 'Logistic Regression', **lr_model_results},
    {'model_name': 'Random Forest', **rf_model_results},
    {'model_name': 'Gradient Boosting', **gb_model_results}
]

# Compare models and find the best one (highest test accuracy)
best_model = max(model_results, key=lambda x: x['accuracy'])

# Print all results
print("Model Comparison:\n")
for result in model_results:
    print(f"Model: {result['model_name']} / Accuracy: {result['accuracy']} / Precision: {result['precision']} / Recall: {result['recall']}")

# Print the best model
print("\nBest Model:")
print(f"Model: {best_model['model_name']} / Accuracy: {best_model['accuracy']} / Precision: {best_model['precision']} / Recall: {best_model['recall']}")

# Save the best model's details. Despite the file name, this stores only the
# name/metrics dictionary, not a fitted estimator.
joblib.dump(best_model, 'Best_model.pkl')

# Load the saved best model details for verification
loaded_model = joblib.load('Best_model.pkl')
print("\nLoaded Best Model Data:", loaded_model)


Model Comparison:

Model: Logistic Regression / Accuracy: 0.964981 / Precision: 0.965 / Recall: 0.965
Model: Random Forest / Accuracy: 0.981489 / Precision: 0.981 / Recall: 0.981
Model: Gradient Boosting / Accuracy: 0.981365 / Precision: 0.981 / Recall: 0.981

Best Model:
Model: Random Forest / Accuracy: 0.981489 / Precision: 0.981 / Recall: 0.981

Loaded Best Model Data: {'model_name': 'Random Forest', 'accuracy': 0.981489, 'precision': 0.981, 'recall': 0.981}
