<a href="https://colab.research.google.com/github/snehithjk/csc-566-public/blob/main/CSC_566_Competition_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BENCHMARK ONE TEST
## NAME: [Your Name]


## Instructions
1. Copy the model you want to use in the cell below.
2. In `create_model()`, change the return value to an instance of your model, initialized with whatever hyperparameter values it should use.
3. Save the notebook and share its link in the Google Form (please make it viewable by anyone with the link).


## Rules
- Your model must complete the test suite in under 7 minutes.
- The model must have a `fit` and `predict` method.
- The `fit` and `predict` methods should work with Pandas DataFrames/Series


Dataset lengths (rows):  {'video_games': 16663, 'life': 2928, 'flare': 1066}


## *TODO*: Put your model code in the cell below

In [None]:
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils import resample
from sklearn.metrics import mean_absolute_error
import numpy as np
import pandas as pd
#import other libs you use...
class ExampleRegressor:
  """
  Minimal placeholder model: always predicts the mean of the training target.

  It exists so the notebook runs end to end before a real model is pasted in.
  Replace it with your own model; only `fit` and `predict` are required.

  Parameters:
    hyperparameter: Example setting that is stored but not used by this
      baseline. It is accepted so the class can be constructed with a
      `hyperparameter=...` keyword argument.
  """

  def __init__(self, hyperparameter=None):
    self.hyperparameter = hyperparameter
    # Mean of the training target; stays None until `fit` has been called.
    self.mean_ = None

  def fit(self, X, y):
    """
    Remember the mean of `y`; `X` is ignored by this baseline.

    Input:
      X: Training features (DataFrame)
      y: Training target (Series or array-like of numbers)
    Output:
      self, so calls can be chained
    """
    self.mean_ = float(np.mean(y))
    return self

  def predict(self, X):
    """
    Predict the stored training mean for every row of `X`.

    Input:
      X: Features to predict for (anything that supports `len`)
    Output:
      NumPy array with one prediction per row of `X`
    Raises:
      RuntimeError: if called before `fit`
    """
    if self.mean_ is None:
      raise RuntimeError("ExampleRegressor must be fit before calling predict")
    return np.full(len(X), self.mean_)
#your model:

## *TODO*: Initialize your model with parameters

In [None]:
def create_model() -> RegressorMixin:
  """
  Build the model instance that will be evaluated.

  Set every hyperparameter here. The model is returned unfitted, so do not
  call `fit` or `predict` inside this function.

  Returns:
    The initialized model
  """
  model = ExampleRegressor(hyperparameter="something")
  return model

## Get Data

In [None]:
# Downloading the datasets
# `-q` silences wget's progress output. No output path is given, so each file
# is saved under its remote name in the current working directory.
!wget -q https://raw.githubusercontent.com/michaelmoschitto/benchmark-team-2/main/data/video_games_data.csv
!wget -q https://raw.githubusercontent.com/michaelmoschitto/benchmark-team-2/main/data/life_expectancy_data.csv

In [None]:
# Imports
import sklearn
import pandas as pd
import numpy as np

### Preprocessing Functions

In [None]:
def preprocess_video_games_df(path_to_video_games_csv):
  """
  Load the video games CSV and split it into features and target.

  The regional sales, review, developer and rating columns are discarded,
  the release year is converted to a string so it acts as a category, and
  rows that still contain missing values are removed.

  Input:
    path_to_video_games_csv (string): Path to csv
  Output:
    (X, y): Tuple
      -> X : Dataframe with every remaining column except 'Global_Sales'
      -> y : Series holding the 'Global_Sales' column
  """
  unused_columns = [
      'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales',
      'Critic_Score', 'Critic_Count', 'User_Score', 'User_Count',
      'Developer', 'Rating',
  ]
  games = pd.read_csv(path_to_video_games_csv).drop(columns=unused_columns)

  # Treat the year as a categorical (string) value. This runs before the
  # missing-value filter, so a missing year has already become the string
  # 'nan' and does not cause its row to be dropped.
  games['Year_of_Release'] = games['Year_of_Release'].apply(str)

  # Remove rows that have a missing value in any remaining column
  games = games.dropna()

  # Separate the target from the features
  target = games['Global_Sales']
  features = games.drop(columns=['Global_Sales'])
  return features, target

In [None]:
def preprocess_life_df(path_to_life_csv):
  """
  Load the life expectancy CSV and split it into features and target.

  Country, Year and Status are converted to strings, six numeric columns are
  replaced by labelled range buckets (for example '(200-300]'), only the
  selected features are kept, and rows with missing values are dropped.

  Input:
    path_to_life_csv (string): Path to csv
  Output:
    (X, y): Tuple
      -> X : Dataframe with the 9 selected feature columns, all strings
      -> y : Series holding the 'Life expectancy ' column
  """
  target_name = 'Life expectancy '

  def range_labels(edges):
    # One '(low-high]' label for each pair of neighbouring bin edges.
    return ['({i}-{j}]'.format(i=low, j=high)
            for low, high in zip(edges[:-1], edges[1:])]

  life = pd.read_csv(path_to_life_csv)

  # Type fixing
  life['Country'] = life['Country'].astype(str)
  life['Year'] = life['Year'].apply(str)
  life['Status'] = life['Status'].astype(str)

  # Bin edges for values on a 0-1000, 0-100 and 0-1 scale respectively
  per1000_bins = list(range(0, 1001, 100))
  per100_bins = list(range(0, 101, 10))
  per1_bins = [round(x * 0.1, 1) for x in range(0, 11)]

  # (output column, source column, bin edges) for each numeric -> range conversion
  binned_columns = [
      ('Adult Mortality', 'Adult Mortality', per1000_bins),
      ('Hepatitis B %immun', 'Hepatitis B', per100_bins),
      ('BMI', ' BMI ', per100_bins),
      ('Polio %immun', 'Polio', per100_bins),
      ('Diphtheria %immun', 'Diphtheria ', per100_bins),
      ('Income composition of resources', 'Income composition of resources', per1_bins),
  ]
  # NOTE(review): the bucketed column is cast with astype(str), so an entry
  # that falls in no bucket may become a literal string rather than a missing
  # value and then survive the dropna below - confirm this is intended.
  for output_name, source_name, edges in binned_columns:
    life[output_name] = pd.cut(life[source_name], bins=edges,
                               labels=range_labels(edges)).astype(str)

  # Keep only the selected features plus the target, then drop missing values
  selected = ['Country', 'Year', 'Status', 'Adult Mortality', 'Hepatitis B %immun',
              'BMI', 'Polio %immun', 'Diphtheria %immun',
              'Income composition of resources', target_name]
  life = life[selected].dropna()

  target = life[target_name]
  features = life.drop(columns=[target_name])
  return features, target

In [None]:
def preprocess_flare_df(**kwargs):
  """
  Load the flare data and split it into features and target.

  Input (keyword arguments):
    link: Location of the data, passed to `pd.read_csv`
    sep: Field separator used in the data
    names: Column names to assign; must include 'largest-spotarea' and 'C'
  Output:
    (X, y): Tuple
      -> X : Dataframe with every column except 'largest-spotarea' and 'C'
      -> y : Series holding the 'C' column
  """
  source = kwargs['link']
  separator = kwargs['sep']
  column_names = kwargs['names']
  flare = pd.read_csv(source, sep=separator, names=column_names)

  # 'largest-spotarea' is removed because it has no variance; rows with
  # missing values are removed afterwards.
  flare = flare.drop(columns=['largest-spotarea']).dropna()

  # Separate the target from the features
  target = flare['C']
  features = flare.drop(columns=['C'])
  return features, target

In [None]:
# Get data
# Get data
# Each entry maps a dataset name to the (X, y) tuple returned by its
# preprocessing function.
# The two CSV paths are relative so the files are found in the current working
# directory, which is where the wget download cell saves them. The flare data
# is read straight from its URL.
datasets = {
    "video_games": preprocess_video_games_df('video_games_data.csv'),
    "life": preprocess_life_df('life_expectancy_data.csv'),
    "flare": preprocess_flare_df(
        link='https://archive.ics.uci.edu/ml/machine-learning-databases/solar-flare/flare.data2',
        sep=" ",
        names=['class', 'size', 'spot', 'activity', 'evolution', 'prev24hr',
               'histcomplex', 'histcomplexsundisk', 'area', 'largest-spotarea', 'C'],
    ),
}

# Number of rows in each dataset's feature frame
print("Dataset lengths: ", {name: len(ds[0]) for name, ds in datasets.items()})

Dataset lengths:  {'video_games': 16663, 'life': 2928, 'flare': 1066}


## Testing

### Accuracy
Measures the test loss between the actual and predicted values using **Mean Squared Error**.

### Performance
Measures the amount of time it takes to `fit` and `predict` with your model.

In [None]:
# Testing Parameters
# Seeds NumPy's global random state and the train/test split
SEED = 42
# Passed as `test_size` to the train/test split of every dataset
TEST_SIZE = 0.2
# Number of timed fit + predict runs per dataset ("video_games" is run only once)
TIMING_ITERATIONS = 3

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import timeit
np.random.seed(SEED)
results = {}

for dataset_name, (X, y) in datasets.items():
  # "video_games" is timed with a single run, presumably to keep the total
  # runtime down; every other dataset is averaged over TIMING_ITERATIONS runs.
  if dataset_name == "video_games":
    n_iterations = 1
  else:
    n_iterations = TIMING_ITERATIONS

  model = create_model()
  x_train, x_test, y_train, y_test = train_test_split(
      X, y, test_size=TEST_SIZE, random_state=SEED, shuffle=True
  )
  preds = None

  def benchmark():
    """Fit on the training split, then predict the test split."""
    # The predictions are published through the module-level `preds` so they
    # can be scored once the timing has finished.
    global preds
    model.fit(x_train, y_train)
    preds = model.predict(x_test)

  # timeit returns the total time for all runs; divide for the per-run mean.
  perf = timeit.timeit(benchmark, number=n_iterations) / n_iterations

  results[dataset_name] = {
      "mse": mean_squared_error(y_test, preds),
      "time/s": perf,
  }

print("Results: ", results)


Results:  {'video_games': {'mse': 4.2254189512510925, 'time': 340.2908347370003}, 'life': {'mse': 7.579101963786914, 'time': 6.352132163333408}, 'flare': {'mse': 0.6467976632721417, 'time': 0.5988086340000033}}
