In [None]:
# ! pip install pandas==1.1.1 joblib==0.14.0 scikit-learn==0.22.1 smote-variants==0.3.13

In [None]:
import os
import glob
import json
import math
from collections import Counter
import pandas as pd
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.ensemble import GradientBoostingRegressor,AdaBoostClassifier 
from sklearn.metrics import mean_squared_error,f1_score
import smote_variants as sv
import joblib

## READING DATA

In [None]:
# Build the data directories with os.path.join instead of hard-coded '\\'
# separators: a backslash is only a path separator on Windows, so on
# Linux/macOS the '*.json' glob over these directories would match no files.
json_dir_train = os.path.join('challenge_data', 'train')
json_dir_test = os.path.join('challenge_data', 'test')

In [None]:
def load_json_records(json_dir):
    """Load and concatenate the records of every ``*.json`` file in ``json_dir``.

    Each file is expected to contain a JSON array; the arrays are appended to
    one flat list in sorted file-name order so the resulting row order is the
    same on every run and platform.

    Parameters
    ----------
    json_dir : str
        Directory that is searched (non-recursively) for ``*.json`` files.

    Returns
    -------
    list
        All records from all matching files. Empty if no file matches.
    """
    records = []
    # glob returns files in arbitrary order; sort for reproducibility.
    for path in sorted(glob.glob(os.path.join(json_dir, '*.json'))):
        # JSON is UTF-8 by specification; do not depend on the platform's
        # default text encoding.
        with open(path, 'r', encoding='utf-8') as handle:
            records.extend(json.load(handle))
    return records


contents_train = load_json_records(json_dir_train)
contents_test = load_json_records(json_dir_test)

In [None]:
# Build the raw train and test DataFrames from the loaded JSON records.
df_train, df_test = (
    pd.DataFrame.from_dict(records)
    for records in (contents_train, contents_test)
)

In [None]:
# (rows, columns) of the raw train frame and of the raw test frame.
(df_train.shape, df_test.shape)

## PRE-PROCESSING

In [None]:
# Collect the distinct technologies mentioned in the `tags` column; each of
# them later becomes a separate indicator column.
# Observed that certain technologies (e.g. html, python) have more challenges
# and submissions than others.
def technology_transformation(data, mastertech_list=None):
    """Extend the master technology list with every tag found in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``tags`` column whose cells are iterables of tag names.
    mastertech_list : list, optional
        Technologies collected so far. When given, it is extended IN PLACE
        (first-seen order, no duplicates) and the same object is returned.
        When omitted, a fresh ``["Unknown"]`` list is created for this call;
        a ``None`` sentinel is used so state is never shared between calls
        through a mutable default argument.

    Returns
    -------
    (pandas.DataFrame, list)
        A copy of ``data`` and the (possibly extended) technology list.
    """
    if mastertech_list is None:
        mastertech_list = ["Unknown"]

    # Set for O(1) membership checks; the list keeps first-seen order.
    seen = set(mastertech_list)
    for tags in data['tags']:
        for tech in tags:
            if tech not in seen:
                seen.add(tech)
                mastertech_list.append(tech)

    return data.copy(), mastertech_list

In [None]:
def preprocessing(df, mastertech_list):
    """Turn raw challenge records into a numeric feature frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw challenges. Must contain the columns ``track``, ``legacy`` (mapping
        with a ``subTrack`` key), ``phases``, ``startDate``, ``endDate``,
        ``prizeSets``, ``tags`` and ``winners``. The input is not modified.
    mastertech_list : list
        Technology names; one 0/1 indicator column is created per entry. Tags
        that are not in this list set the ``"Unknown"`` indicator instead. The
        list is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``no_of_tags``, ``no_of_phases``, ``total_prize``,
        ``no_of_winners`` (the target), ``challenge_duration``, the technology
        indicators, ``start_day``, ``end_day`` (Sunday=0 ... Saturday=6),
        followed by the one-hot columns of ``track`` and ``subTrack``.

    Notes
    -----
    The one-hot columns come from ``pd.get_dummies(..., drop_first=True)`` and
    therefore depend on the categories present in ``df``. Two frames encoded
    by separate calls only share the same columns if they contain the same
    ``track``/``subTrack`` values.
    """
    # Only features common to active and completed challenges are used, so the
    # same code works for both.
    df = df[['track', 'legacy', 'phases', 'startDate', 'endDate',
             'prizeSets', 'tags', 'winners']].copy()

    df['no_of_tags'] = df['tags'].apply(len)
    df['subTrack'] = df['legacy'].apply(lambda legacy: legacy['subTrack'])
    df['no_of_phases'] = df['phases'].apply(len)
    # Total prize of the first prize set; a challenge with an empty prizeSets
    # list counts as 0 instead of raising IndexError.
    df['total_prize'] = df['prizeSets'].apply(
        lambda prize_sets: sum(prize['value'] for prize in prize_sets[0]['prizes'])
        if prize_sets else 0
    )
    df['no_of_winners'] = df['winners'].apply(len)  # target variable

    # Keep only the calendar date of each timestamp (time of day set to 00:00).
    for col in ('startDate', 'endDate'):
        df[col] = pd.to_datetime(df[col], format='%Y-%m-%d').dt.normalize()

    # Challenge duration in whole days.
    df['challenge_duration'] = (df['endDate'] - df['startDate']).abs().dt.days

    # Technology indicators, built in a single pass over the tags instead of a
    # row-wise DataFrame.apply. Unlisted tags go to "Unknown"; if "Unknown" is
    # not part of mastertech_list the column is created on first use and is
    # 0 (not NaN) for all other rows.
    n_rows = len(df)
    flags = {tech: [0] * n_rows for tech in mastertech_list}
    for position, tags in enumerate(df['tags']):
        for tech in tags:
            key = tech if tech in flags else "Unknown"
            if key not in flags:
                flags[key] = [0] * n_rows
            flags[key][position] = 1
    for tech, column in flags.items():
        df[tech] = column

    # Day of week with Sunday=0 ... Saturday=6. pandas numbers Monday=0 ...
    # Sunday=6, hence the shift by one modulo 7.
    df['start_day'] = ((df['startDate'].dt.dayofweek + 1) % 7).astype('int64')
    df['end_day'] = ((df['endDate'].dt.dayofweek + 1) % 7).astype('int64')

    df = df.drop(columns=['tags', 'winners', 'phases', 'legacy',
                          'startDate', 'endDate', 'prizeSets'])

    # One-hot encode the remaining categorical columns (track, subTrack).
    return pd.get_dummies(df, drop_first=True)

In [None]:
# Get the distinct technology list.
# Pass an explicit fresh list rather than relying on the function's default
# argument, so re-running this cell always starts from ["Unknown"] only.
# NOTE(review): tags that occur only in the test set are added to the list as
# well, so the "Unknown" indicator is never set for this test data - confirm
# that this is intended.
df_train, mastertech_list = technology_transformation(df_train, ["Unknown"])
df_test, mastertech_list = technology_transformation(df_test, mastertech_list)

In [None]:
# Preprocess train and test in ONE pass and split afterwards.
# preprocessing() one-hot encodes with pd.get_dummies(drop_first=True), whose
# output columns depend on the categories present in the frame it receives.
# Encoding the two sets separately can therefore produce different (or
# differently aligned) feature columns whenever a track/subTrack value occurs
# in only one of them, while the cells below compare the sets positionally via
# `.values`. A shared pass guarantees identical columns in identical order.
n_train_rows = len(df_train)
df_all = preprocessing(
    pd.concat([df_train, df_test], ignore_index=True, sort=False),
    mastertech_list,
)
df_train = df_all.iloc[:n_train_rows].reset_index(drop=True)
df_test = df_all.iloc[n_train_rows:].reset_index(drop=True)

In [None]:
# (rows, columns) of the preprocessed train frame and test frame.
(df_train.shape, df_test.shape)

## REGRESSION TRAINING

In [None]:
# Get X and y values of the training and test set for the regression model.
# The targets are copied: `.values` on a single column may return a view of
# the frame's data, and the classification section later overwrites
# `no_of_winners` in these same frames, while y_test is only read in the
# evaluation section afterwards. The copy guarantees the regression targets
# keep the original winner counts however pandas performs that assignment.
X_train = df_train.drop(columns=['no_of_winners']).values
y_train = df_train['no_of_winners'].values.copy()
X_test = df_test.drop(columns=['no_of_winners']).values
y_test = df_test['no_of_winners'].values.copy()

In [None]:
# Standardise, then min-max normalise the regression features.
# Both scalers are fitted on the training features only and re-used unchanged
# on the test features.
sc = StandardScaler()
n = MinMaxScaler()

X_train = n.fit_transform(sc.fit_transform(X_train))
X_test = n.transform(sc.transform(X_test))

In [None]:
# Train the regression model.
# max_features=None uses all features, which is what 'auto' meant for
# GradientBoostingRegressor; the 'auto' spelling is rejected by newer
# scikit-learn releases. random_state fixes the estimator's internal
# randomness so repeated runs give the same model.
GBR = GradientBoostingRegressor(
    learning_rate=0.4,
    max_depth=4,
    max_features=None,
    min_samples_leaf=8,
    min_samples_split=5,
    n_estimators=62,
    random_state=40,
)
GBR.fit(X_train, y_train)

## CLASSIFICATION TRAINING

In [None]:
# Create the classification target: 1 if a challenge has at least one winner,
# otherwise 0. This overwrites `no_of_winners` in both frames.
df_train["no_of_winners"] = df_train["no_of_winners"].ge(1).astype("int64")
df_test["no_of_winners"] = df_test["no_of_winners"].ge(1).astype("int64")

In [None]:
# Get X and y values of the training and test set for the classification
# model (target = binarised `no_of_winners`).
target_col = ['no_of_winners']
X_train_class = df_train.drop(target_col, axis=1).values
X_test_class = df_test.drop(target_col, axis=1).values
y_train_class = df_train['no_of_winners'].values
y_test_class = df_test['no_of_winners'].values

In [None]:
# Class counts of the classification target before oversampling.
print("Target Distribution before Sampling - ")
for caption, targets in (
    ("Train Target Distribution :", y_train_class),
    ("Test Target Distribution :", y_test_class),
):
    print(caption, Counter(targets))

In [None]:
# Oversample the training set to counter the class imbalance; the test set is
# left untouched.
# NOTE(review): proportion=1.5 presumably oversamples beyond parity rather
# than to an exactly balanced set - confirm against the smote_variants docs.
# This cell feeds its own output back into X_train_class / y_train_class, so
# running it twice oversamples twice.
oversampler = sv.polynom_fit_SMOTE(
    proportion=1.5,
    topology='mesh',
    random_state=40,
)
X_train_class, y_train_class = oversampler.sample(X_train_class, y_train_class)

In [None]:
# Standardise the classification features; the scaler is fitted on the
# (oversampled) training features and re-used unchanged on the test features.
sc_class = StandardScaler().fit(X_train_class)
X_train_class = sc_class.transform(X_train_class)
X_test_class = sc_class.transform(X_test_class)

In [None]:
# Class counts of the classification target after oversampling.
print("Target Distribution after Sampling - ")
for caption, targets in (
    ("Train Target Distribution :", y_train_class),
    ("Test Target Distribution :", y_test_class),
):
    print(caption, Counter(targets))

In [None]:
# Train the classification model.
# `base_estimator=None` and `algorithm='SAMME.R'` were the library defaults
# for the scikit-learn version pinned at the top of this notebook; they are
# no longer passed explicitly because newer scikit-learn releases renamed or
# removed those arguments. random_state fixes the estimator's internal
# randomness so repeated runs give the same model.
ADA = AdaBoostClassifier(
    learning_rate=0.2,
    n_estimators=500,
    random_state=40,
)

ADA.fit(X_train_class, y_train_class)

## SAVE MODELS

In [None]:
# GBR was trained on features transformed by `sc` and then `n`, so the fitted
# scalers are stored as well: a later prediction has to apply the very same
# transforms to its inputs before calling the model.
joblib.dump((sc, n), 'regression_scalers.pkl')
joblib.dump(GBR, 'regression_model.pkl')

In [None]:
# ADA was trained on features transformed by `sc_class`, so the fitted scaler
# is stored as well: a later prediction has to apply the very same transform
# to its inputs before calling the model.
joblib.dump(sc_class, 'classification_scaler.pkl')
joblib.dump(ADA, 'classification_model.pkl')

## EVALUATION

In [None]:
# Regression predictions for the scaled test features.
y_pred = GBR.predict(X_test)

In [None]:
# Classification predictions for the scaled test features.
y_pred_class = ADA.predict(
    X_test_class
)

In [None]:
# Root mean squared error of the regression model on the test set.
mse = mean_squared_error(y_test, y_pred)
RMSE = math.sqrt(mse)

# F1 score of the classification model on the test set.
F1 = f1_score(y_test_class, y_pred_class)

In [None]:
# Report both evaluation metrics.
for metric_label, metric_value in (("RMSE :", RMSE), ("F1 :", F1)):
    print(metric_label, metric_value)

In [None]:
# Combined score: F1 divided by (1 + RMSE).
final_metric_score = F1 / (1 + RMSE)
print("FINAL METRIC SCORE : ", final_metric_score)