---
# Program to balance the proportions of output labels 

In [1]:
# Loading Libraries
import pandas as pd
import numpy as np
from tabulate import tabulate

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split as ttsplit
import constants

# Seed passed as `random_state` to the sampling and shuffling steps further down,
# so repeated runs select and order the same rows
RANDOM_STATE = 53

In [2]:
# Read the raw training file from the configured data location
train_csv_path = f"{constants.DATA_PATH}train.csv"
df = pd.read_csv(train_csv_path)

# Show the frame's dimensions together with a preview of its first rows
(df.shape, df.head())

((903605, 2),
            from-to  score
 0    6320501-27657    0.0
 1  6310893-6295314    1.0
 2  6285646-6303434    3.0
 3   870193-6275811    1.0
 4  6270099-6270025    0.0)

In [3]:
# Inspect the dtype pandas inferred for each column of the freshly loaded frame
df.dtypes

from-to     object
score      float64
dtype: object

In [4]:
# Separating the `from-to` column into its two user ids.
# `.str.split(..., expand=True)` is vectorised and returns one column per part,
# so no per-row lambda or intermediate NumPy array is needed; `n=1` splits on
# the first '-' only, which caps the result at two columns.
# Both new columns hold the ids as strings (object dtype).
df[['userA_id', 'userB_id']] = df['from-to'].str.split('-', n=1, expand=True)
df.head()

Unnamed: 0,from-to,score,userA_id,userB_id
0,6320501-27657,0.0,6320501,27657
1,6310893-6295314,1.0,6310893,6295314
2,6285646-6303434,3.0,6285646,6303434
3,870193-6275811,1.0,870193,6275811
4,6270099-6270025,0.0,6270099,6270025


In [5]:
# Re-inspect the column dtypes now that `userA_id` and `userB_id` have been added
df.dtypes

from-to      object
score       float64
userA_id     object
userB_id     object
dtype: object

In [6]:
# Recast `score` from float64 to the generic object dtype
# NOTE(review): presumably so the labels are treated as categories rather than
# as numbers — confirm the intent
type_correction = dict(score='object')
df = df.astype(type_correction)

 **Important**
 
 - The standalone analysis of `train.csv` is expected to be covered in `UA_train.ipynb`.

In [7]:
# Share of each output label in the full dataset, expressed in percent
df['score'].value_counts(normalize=True).mul(100)

0.0    49.939741
1.0    45.036935
3.0     4.167861
2.0     0.855462
Name: score, dtype: float64

In [8]:
# Subsetting Dataset to balance the output labels: one frame per score value
label_frames = [df[df['score'] == label] for label in (3, 2, 1, 0)]

# Row count of the least frequent label; every label is down-sampled to it
sample_size = min(len(frame) for frame in label_frames)

# Randomly selecting observations such that the classes are balanced.
# `DataFrame.sample` draws without replacement and accepts n == len(frame),
# so the rarest label keeps all of its rows (no "- 1" workaround is required)
# and a label with zero rows yields empty subsets instead of a negative size.
# The fixed `random_state` keeps the selection repeatable between runs.
df_label_3, df_label_2, df_label_1, df_label_0 = [
    frame.sample(n=sample_size, random_state=RANDOM_STATE)
    for frame in label_frames
]


In [9]:
# Stack the per-label subsets into one frame, then shuffle the rows with the
# fixed seed so the labels are interleaved rather than grouped
parts = [df_label_3, df_label_2, df_label_1, df_label_0]
df_train = shuffle(pd.concat(parts), random_state=RANDOM_STATE)

In [10]:
# Share of each label in the balanced frame, in percent
df_train['score'].value_counts(normalize=True).mul(100)

3.0    25.0
2.0    25.0
1.0    25.0
0.0    25.0
Name: score, dtype: float64

In [11]:
# Persist the balanced frame to the configured output file:
# a header row is written, the row index is not
df_train.to_csv(constants.OUTPUT_FILE, index=False, header=True)