In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score


In [5]:
# Read the competition CSVs into DataFrames.
# low_memory=False parses each file in a single pass instead of in chunks,
# which avoids mixed-dtype inference within a column.
train_df, test_df = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ('unimelb_training.csv', 'unimelb_test.csv')
)

# Preview the first rows of each frame (training first, then test)
print(train_df.head(), test_df.head(), sep='\n')

   Grant.Application.ID  Grant.Status Sponsor.Code Grant.Category.Code  \
0                     1             1          NaN                 NaN   
1                     2             1           2B                 10A   
2                     3             1          29A                 10B   
3                     4             1          40D                 10B   
4                     5             0          59C                 10A   

  Contract.Value.Band...see.note.A Start.date  RFCD.Code.1  RFCD.Percentage.1  \
0                               A     8/11/05     280199.0              100.0   
1                               B    11/11/05     280103.0               30.0   
2                               A    14/11/05     321004.0               60.0   
3                               C    15/11/05     270602.0               50.0   
4                               A    16/11/05     260500.0               34.0   

   RFCD.Code.2  RFCD.Percentage.2  ...  Faculty.No..15  With.PHD.15 

In [6]:
# Key columns to use: identifier, target, and the first sponsor / category /
# RFCD / SEO / person fields of each application.
columns_to_use = [
    'Grant.Application.ID', 'Grant.Status', 'Sponsor.Code', 'Grant.Category.Code',
    'Contract.Value.Band...see.note.A', 'Start.date', 'RFCD.Code.1', 'RFCD.Percentage.1',
    'SEO.Code.1', 'SEO.Percentage.1', 'Person.ID.1', 'Role.1', 'Year.of.Birth.1',
    'Number.of.Successful.Grant.1', 'Number.of.Unsuccessful.Grant.1'
]

# The test set has no 'Grant.Status' column, so it is selected without the target.
test_columns = [col for col in columns_to_use if col != 'Grant.Status']

# Take explicit copies: later cells assign into these frames, and writing into
# a column selection of another DataFrame can raise SettingWithCopyWarning
# (pandas without copy-on-write).
train_df = train_df.loc[:, columns_to_use].copy()
test_df = test_df.loc[:, test_columns].copy()

# Display the updated dataframes
print(train_df.head())
print(test_df.head())


   Grant.Application.ID  Grant.Status Sponsor.Code Grant.Category.Code  \
0                     1             1          NaN                 NaN   
1                     2             1           2B                 10A   
2                     3             1          29A                 10B   
3                     4             1          40D                 10B   
4                     5             0          59C                 10A   

  Contract.Value.Band...see.note.A Start.date  RFCD.Code.1  RFCD.Percentage.1  \
0                               A     8/11/05     280199.0              100.0   
1                               B    11/11/05     280103.0               30.0   
2                               A    14/11/05     321004.0               60.0   
3                               C    15/11/05     270602.0               50.0   
4                               A    16/11/05     260500.0               34.0   

   SEO.Code.1  SEO.Percentage.1  Person.ID.1                Role.1  

In [8]:
def _fill_missing_and_parse_dates(frame, date_col='Start.date', date_format='%d/%m/%y'):
    """Return a cleaned copy of ``frame``; the input frame is not modified.

    Missing values in every column except ``date_col`` are replaced with 0.
    ``date_col`` is left out of the zero-fill on purpose: a literal 0 cannot be
    parsed with ``date_format``, so a single missing date would make the
    conversion fail. Missing dates become NaT instead.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame that contains ``date_col``.
    date_col : str
        Name of the column holding day/month/year strings.
    date_format : str
        ``strftime``-style format passed to ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        New frame with filled values and ``date_col`` converted to datetime.
    """
    # NOTE(review): 0 is also used for missing numeric fields such as
    # 'Year.of.Birth.1' -- confirm that 0 is an acceptable sentinel there.
    fill_values = {col: 0 for col in frame.columns if col != date_col}
    cleaned = frame.fillna(value=fill_values)
    cleaned[date_col] = pd.to_datetime(cleaned[date_col], format=date_format)
    return cleaned


# Handle missing values and convert 'Start.date' to datetime, applying the
# same treatment to both frames.
train_df = _fill_missing_and_parse_dates(train_df)
test_df = _fill_missing_and_parse_dates(test_df)


In [9]:
# Columns holding category codes / identifiers that are label-encoded to integers
categorical_cols = [
    'Sponsor.Code', 'Grant.Category.Code', 'Contract.Value.Band...see.note.A',
    'RFCD.Code.1', 'SEO.Code.1', 'Person.ID.1', 'Role.1'
]

# Combine training and test data so each encoder sees every value that occurs
# in either frame and both frames share one value -> integer mapping.
combined_df = pd.concat([train_df.drop(columns=['Grant.Status']), test_df])

# Encode into copies rather than overwriting train_df / test_df: the inputs
# stay as loaded, so re-running this cell yields the same features instead of
# re-encoding already-encoded integers.
train_encoded = train_df.copy()
test_encoded = test_df.copy()

label_encoders = {}
for col in categorical_cols:
    # Values are cast to str so mixed contents (codes plus filled-in
    # placeholders) are compared as one type.
    le = LabelEncoder().fit(combined_df[col].astype(str))
    train_encoded[col] = le.transform(train_df[col].astype(str))
    test_encoded[col] = le.transform(test_df[col].astype(str))
    label_encoders[col] = le

# Define features and target; the ID and the raw date are not used as features
X_train = train_encoded.drop(columns=['Grant.Application.ID', 'Grant.Status', 'Start.date'])
y_train = train_encoded['Grant.Status']
X_test = test_encoded.drop(columns=['Grant.Application.ID', 'Start.date'])

# Standardize the data: statistics are learned on the training features only
# and then applied unchanged to the test features.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [10]:

# Random forest classifier: 100 trees, with random_state pinned so that a
# refit on the same data reproduces the same model.
forest_settings = {'n_estimators': 100, 'random_state': 42}
model = RandomForestClassifier(**forest_settings)

# Fit on the full training features / target
model.fit(X=X_train, y=y_train)


In [12]:
# Make predictions on the training set.
# These scores are computed on the same rows the model was fitted on, so they
# only show how well the forest memorised the data -- not how it generalises.
y_train_pred = model.predict(X_train)
y_train_pred_proba = model.predict_proba(X_train)[:, 1]

accuracy = accuracy_score(y_train, y_train_pred)
roc_auc = roc_auc_score(y_train, y_train_pred_proba)

print(f'Accuracy on training set: {accuracy}')
print(f'ROC-AUC Score on training set: {roc_auc}')

# Hold-out estimate: refit a forest with the same hyper-parameters on 80% of
# the training rows and score it on the remaining 20% it has never seen.
# The split is stratified so both parts keep the same class balance.
# NOTE(review): this is a random split; if the test set covers a later time
# period than the training set, a date-based split would be a stricter check.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)
val_model = RandomForestClassifier(**model.get_params())
val_model.fit(X_fit, y_fit)

val_accuracy = accuracy_score(y_val, val_model.predict(X_val))
val_roc_auc = roc_auc_score(y_val, val_model.predict_proba(X_val)[:, 1])

print(f'Accuracy on validation set: {val_accuracy}')
print(f'ROC-AUC Score on validation set: {val_roc_auc}')


Accuracy on training set: 0.9968994028479559
ROC-AUC Score on training set: 0.9998980414178654


In [13]:

# Score the test set and keep the second column of predict_proba.
# Column order follows model.classes_; presumably this is Grant.Status == 1.
class_probabilities = model.predict_proba(X_test)
test_pred_proba = class_probabilities[:, 1]

# Build the submission table: application ID plus predicted probability
submission_df = test_df[['Grant.Application.ID']].assign(
    **{'Probability of Success': test_pred_proba}
)

# Write the submission to disk without the DataFrame index
submission_df.to_csv('submission.csv', index=False)