# Run this entire section to train the model with the given dataset

In [None]:
# Download the individual dataset
# (IPython shell escape; `wget -O` saves the download under the local name given after it.)
!wget -O cwk_data_20535493.csv "https://drive.google.com/uc?export=download&id=1Bu5Lge14q5fL0uvOGWutwT5yj6y0Vfla"

In [None]:
# Dataset filename: the local CSV written by the download cell (same name as its `wget -O` target).
# It is passed to clean_dataset() when the training features are prepared.
filename = "cwk_data_20535493.csv"

In [None]:
def clean_dataset(filename):
    """Load a CSV file and return a cleaned DataFrame for modelling.

    The cleaning steps are:

    1. For every column, report how many values equal the literal string
       'unknown' and what share of the non-null values that is.
    2. Drop every column whose 'unknown' share is strictly above 50%.
    3. In the remaining columns, replace 'unknown' with the most frequent
       *known* value of that column.
    4. Drop the 'duration' column if it is present.

    Progress information is printed along the way.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A new pandas DataFrame with the steps above applied.
    """
    import pandas as pd

    # Read the csv file
    data = pd.read_csv(filename)

    # Count of unknowns for each field and their share in that particular field
    columns_to_drop = []
    for col in data.columns:
        # get() handles the case where 'unknown' does not occur in the column
        unknown_cnt = data[col].value_counts().get('unknown', 0)
        non_null_cnt = data[col].count()
        # Guard against an all-null (or empty) column, which would divide by zero
        unknown_pct = (unknown_cnt / non_null_cnt) * 100 if non_null_cnt else 0.0
        print(f'Field: {col}    Unknown Count: {unknown_cnt}   Unknown Percentage: {unknown_pct:.2f}%')
        if unknown_pct > 50:
            columns_to_drop.append(col)

    if not columns_to_drop:
        print('No column needs to be fully dropped as the proportion of unknown values are not very high for any given column.')
    else:
        print('\n\nColumns to be dropped:\n {}'.format(columns_to_drop))

    # Every name in columns_to_drop came from data.columns, so no existence
    # check is needed; drop() returns a new frame even for an empty list.
    data_cleansed = data.drop(columns=columns_to_drop)

    # Replace the 'unknown' values in remaining fields by the mode of their field
    for col in data_cleansed.columns:
        is_unknown = data_cleansed[col] == 'unknown'
        # The mode is taken over the known values only. Otherwise a column in
        # which 'unknown' is the most frequent value (but not above the drop
        # threshold) would have its unknowns "replaced" by 'unknown'.
        known_modes = data_cleansed.loc[~is_unknown, col].mode()
        mode = known_modes.iloc[0] if not known_modes.empty else None
        print('Field: {}, Mode: {}'.format(col, mode))
        if mode is not None:
            data_cleansed.loc[is_unknown, col] = mode
        unknown_count = data_cleansed[col].value_counts().get('unknown', 0)
        print(f'Count of unknowns for {col} after replacing: {unknown_count}')

    # 'duration' is removed when present; errors='ignore' covers its absence
    model_data = data_cleansed.drop(columns='duration', errors='ignore')

    print('Success')

    return model_data


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, make_scorer, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Load and clean the training dataset
model_data = clean_dataset(filename)

output_feature = 'y'

# Exclude the target by name rather than by position, so the target can never
# end up among the predictors (and no real predictor is dropped) if it is not
# the last column of the file.
predictor_data = model_data.drop(columns=[output_feature])

# Separate numerical and categorical features
numerical_features = predictor_data.select_dtypes(include=['int64', 'float64'])
categorical_features = predictor_data.select_dtypes(include=['object']).copy()

# Label encode the categorical features.
# NOTE: the integer codes are derived from the categories present in this
# dataset; any other dataset encoded the same way only receives matching codes
# if it contains the same set of categories.
for col in categorical_features.columns:
    categorical_features[col] = categorical_features[col].astype('category').cat.codes

# Concatenate numerical and modified categorical features
features = pd.concat([numerical_features, categorical_features], axis=1)

# Split the data into features (X) and target variable (y)
X = features
y = model_data[output_feature]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Random Forest configured with the chosen hyper-parameters.
# random_state is fixed so repeated runs build the same forest.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    max_features='log2',
    min_samples_split=15,
    min_samples_leaf=10,
    class_weight='balanced',
    random_state=42,
)

# Train on every available row, then predict those same rows.
# NOTE: the figures below are therefore in-sample (training-set) metrics.
rf_model.fit(X, y)
y_pred_rf = rf_model.predict(X)

# Summary metrics, with 'yes' treated as the positive class
accuracy_rf = accuracy_score(y, y_pred_rf)
precision_rf = precision_score(y, y_pred_rf, pos_label='yes')
recall_rf = recall_score(y, y_pred_rf, pos_label='yes')
f1_rf = f1_score(y, y_pred_rf, pos_label='yes')

# Confusion matrix rendered as an annotated heatmap
cm_rf = confusion_matrix(y, y_pred_rf)
plt.figure(figsize=(5, 4))
heatmap_axes = sns.heatmap(
    cm_rf,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=rf_model.classes_,
    yticklabels=rf_model.classes_,
)
heatmap_axes.set_xlabel('Predicted')
heatmap_axes.set_ylabel('Actual')
heatmap_axes.set_title('Confusion Matrix (Random Forest)')
plt.show()

# Per-class precision / recall / F1 table
print('\nClassification Report:')
report_text = classification_report(y, y_pred_rf, target_names=['no', 'yes'])
print(report_text)

In [None]:
def final_model_prediction(test_data_file_name):
    """Evaluate the already-trained global ``rf_model`` on a labelled test CSV.

    The file is cleaned with ``clean_dataset`` and its features are prepared
    the same way as the training features (numeric columns kept as they are,
    object columns label-encoded). The function then prints summary metrics,
    shows a confusion-matrix heatmap and prints a classification report for
    the predictions made on this test file.

    Args:
        test_data_file_name: Path of the test CSV file. It must contain the
            target column 'y'.

    Returns:
        None. All results are printed or plotted.
    """
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
    import matplotlib.pyplot as plt
    import pandas as pd
    import seaborn as sns

    model_data = clean_dataset(test_data_file_name)

    output_feature = 'y'

    # Exclude the target by name rather than by position so it can never be
    # fed to the model as a feature.
    predictor_data = model_data.drop(columns=[output_feature])

    # Separate numerical and categorical features
    numerical_features = predictor_data.select_dtypes(include=['int64', 'float64'])
    categorical_features = predictor_data.select_dtypes(include=['object']).copy()

    # Label encode the categorical features.
    # NOTE(review): these codes are derived from the categories present in the
    # test file alone; they match the codes seen during training only if both
    # files contain the same set of categories per column - confirm for new data.
    for col in categorical_features.columns:
        categorical_features[col] = categorical_features[col].astype('category').cat.codes

    # Concatenate numerical and modified categorical features
    X = pd.concat([numerical_features, categorical_features], axis=1)
    y = model_data[output_feature]

    # Predict the test dataset outcomes using the model trained earlier - rf_model
    y_test_pred = rf_model.predict(X)

    # Use the model's own class order everywhere so that matrix rows/columns,
    # tick labels and report rows always line up, even if a class is missing
    # from the test file.
    class_labels = list(rf_model.classes_)

    # Calculate and display various metrics on the test dataset
    accuracy_rf = accuracy_score(y, y_test_pred)
    precision_rf = precision_score(y, y_test_pred, pos_label='yes')
    recall_rf = recall_score(y, y_test_pred, pos_label='yes')
    f1_rf = f1_score(y, y_test_pred, pos_label='yes')
    print(f'Accuracy: {accuracy_rf:.4f}  Precision: {precision_rf:.4f}  Recall: {recall_rf:.4f}  F1: {f1_rf:.4f}')

    # Display Confusion Matrix
    cm_rf = confusion_matrix(y, y_test_pred, labels=class_labels)
    plt.figure(figsize=(5, 4))
    sns.heatmap(cm_rf, annot=True, fmt='d', cmap='Blues', cbar=False,
                xticklabels=class_labels, yticklabels=class_labels)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix (Random Forest)')
    plt.show()

    # Display the classification report for the TEST predictions
    # (not for the training-time predictions held in the global y_pred_rf).
    print('\nClassification Report:')
    print(classification_report(y, y_test_pred, labels=class_labels,
                                target_names=[str(label) for label in class_labels]))




# Upload the test file and enter its name

In [None]:
# Upload the test file and provide the name of the file in the quotes below.
# The string below is a placeholder and must be replaced before running the prediction cell.
test_data_file_name = 'your file name please'

# Run this section to get the predictions on the test dataset

In [None]:
# Clean the named test file, predict with the trained rf_model and show the evaluation output.
final_model_prediction(test_data_file_name)