# Random Forest on Astro First Movie Purchase

In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing

### Load and Prepare Data

In [3]:
df = pd.read_csv(r'C:\Users\Shireen\Desktop\Predictive analysis\Training data for Astro First Munafik.csv')

# Remove duplicated accounts
df.drop_duplicates(inplace=True)
# Impute missing values in Gender with 'Unknown'
df['Gender'] = df['Gender'].fillna('Unknown')

### Initialise the Random Forest Model

In [4]:
# Set the seed
np.random.seed(415)

# Initialize label encoder
label_encoder = preprocessing.LabelEncoder()

# Convert categorical variables to numeric
df['Race'] = label_encoder.fit_transform(df['Race'])
df['Gender'] = label_encoder.fit_transform(df['Gender'])

# Initialize the model
rf_model = RandomForestClassifier(n_estimators=1000, # Number of trees
                                  max_features=2,    # Num features considered
                                  oob_score=True)    # Use OOB scoring*

### Train the Model

In [5]:
# Training features
features = ["Race","Gender","Astro First 2015 Q4 purchases","Astro First 2016 Q1 purchases",\
            "Astro Best 2015 Q4 purchases","Astro Best 2016 Q1 purchases","Having Super Pack","Having AOTG"]

# Train the model
rf_model.fit(X=df[features],
             y=df["Purchased Munafik"])

print("OOB accuracy: ")
print(rf_model.oob_score_)

OOB accuracy: 
0.748821744677


Check feature importance for the random forest model. This tells us the most useful variables that contributes in our model.

In [6]:
for feature, imp in zip(features, rf_model.feature_importances_):
    print(feature, imp)

('Race', 0.43473136395198531)
('Gender', 0.072708199260066872)
('Astro First 2015 Q4 purchases', 0.15101339995057131)
('Astro First 2016 Q1 purchases', 0.1141107582827169)
('Astro Best 2015 Q4 purchases', 0.088913127798355154)
('Astro Best 2016 Q1 purchases', 0.11629287710444133)
('Having Super Pack', 0.012512376681129462)
('Having AOTG', 0.0097178969707332283)


Feature importance tells us that Race is the most useful variable. It also tells us that "Having AOTG" and "Having Super Pack" are not that important and can be eliminated.

In [7]:
# Training features
features = ["Race","Gender","Astro First 2015 Q4 purchases","Astro First 2016 Q1 purchases",\
            "Astro Best 2015 Q4 purchases","Astro Best 2016 Q1 purchases"]

# Train the model
rf_model.fit(X=df[features],
             y=df["Purchased Munafik"])

print("OOB accuracy: ")
print(rf_model.oob_score_)

OOB accuracy: 
0.750004957596


It shows that eliminating these features slightly improve the prediction accuracy. Therefore only these variables will be used to predict the test data.

## Test Set

In [40]:
df_test = pd.read_csv(r'C:\Users\Shireen\Desktop\Predictive analysis\Data to predict for Astro First Munafik.csv')

# Impute missing values in Gender with 'Unknown'
df_test['Gender'] = df_test['Gender'].fillna('Unknown')

In [66]:
# Convert categorical variables to numeric
df_test['Race'] = label_encoder.fit_transform(df_test['Race'])
df_test['Gender'] = label_encoder.fit_transform(df_test['Gender'])

# Make test set predictions
test_preds = rf_model.predict(X= df_test[features])
df_test['Purchased Munafik'] = pd.DataFrame(test_preds)
df_test.to_csv(r'C:\Users\Shireen\Desktop\Predictive analysis\Data to predict for Astro First Munafik - Submission.csv')