Project 7

In [None]:
import pandas as pd

# Read the two CSV inputs from the working directory: the historical
# records (which carry the 'target' column used later) and the list of
# potential customers.
train_path, test_path = 'train.csv', 'test.csv'
train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)


In [None]:
# Count the null entries in every column of the training frame
missing_per_column = train_data.isnull().sum()
print(missing_per_column)

# Plotting libraries for the class-balance chart
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of how many rows fall into each 'target' value
sns.countplot(data=train_data, x='target')
plt.show()


In [None]:
# Handle missing values
#
# Only numeric *feature* columns are imputed and screened for outliers.
# - DataFrame.median()/quantile() over every column raises on pandas >= 2.0
#   as soon as a non-numeric column is present.
# - The label must be left out of the outlier rule: for an imbalanced binary
#   'target' both quartiles coincide, the IQR is 0, and every minority-class
#   row would be discarded as an "outlier".
numeric_cols = train_data.select_dtypes(include='number').columns
feature_cols = numeric_cols.drop('target', errors='ignore')

# A row without a label cannot be used for supervised training, and
# imputing the label with a median would fabricate it.
train_data = train_data.dropna(subset=['target'])

# Fill gaps in each numeric feature with that feature's median.
# fillna() returns a new frame, so no in-place mutation is needed.
feature_medians = train_data[feature_cols].median()
train_data = train_data.fillna(feature_medians)

# Treat outliers
# Tukey fences: a value is an outlier when it lies more than 1.5 * IQR
# outside the [Q1, Q3] range of its own column.
features = train_data[feature_cols]
Q1 = features.quantile(0.25)
Q3 = features.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Drop every row that has at least one feature outside its fences.
# NOTE(review): near-constant features (e.g. 0/1 flags) also have IQR == 0,
# so their rarer value is still removed here -- confirm this is intended.
# NOTE: re-running this cell trims again, because the fences are recomputed
# from the already-filtered frame.
is_outlier = ((features < lower_fence) | (features > upper_fence)).any(axis=1)
train_data = train_data[~is_outlier]


In [None]:
from imblearn.over_sampling import SMOTE

# Separate features and target
X = train_data.drop('target', axis=1)
y = train_data['target']

# Apply SMOTE
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X, y)


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize numerical features
scaler = StandardScaler()
X_resampled_scaled = scaler.fit_transform(X_resampled)


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_resampled_scaled, y_resampled, test_size=0.2, random_state=42)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize and train the model
# random_state is fixed (same seed as the train/test split) so the forest,
# and therefore every metric below, is reproducible on a fresh kernel.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the held-out rows
y_pred = rf_model.predict(X_test)

# Assess model performance
# Accuracy gives a single headline number; the classification report adds
# per-class precision/recall/F1.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))


In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

# Define parameter grid
# 3 * 3 * 3 * 3 * 2 = 162 combinations; with cv=3 that is 486 forest fits
# plus one final refit, so this cell is the slow one in the notebook.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initialize and train GridSearchCV
# rf_model is only used as a template: GridSearchCV clones it for each fit.
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Get best parameters and the tuned model
# With the default refit=True, best_estimator_ has already been refit on
# all of X_train using best_params_, so it is used as-is. Calling fit()
# again would only repeat that work (and, for an unseeded forest, replace
# the selected model with a different random one).
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
print("Best Parameters:", best_params)

# Make predictions with the best model
y_pred = best_model.predict(X_test)

# Assess model performance
accuracy = accuracy_score(y_test, y_pred)
print("Best Model Accuracy:", accuracy)


Conclusion