In [11]:
import pandas as pd
import numpy as np
import random
import string
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTENC
from imblearn.over_sampling import RandomOverSampler
import featuretools as ft

In [12]:
template = 'Versions/IMDB_Ver_{}.csv'

In [13]:
df = pd.read_csv('imdb_top_1000.csv')

In [14]:
df = df[['Series_Title', 'Released_Year','Genre', 'Runtime', 'IMDB_Rating', 'Overview']]
df_with_na = df.copy()
df = df.dropna()

In [15]:
df_valid, df_generalization = train_test_split(df, test_size=0.2, random_state=42)
df_valid = df_valid.reset_index().drop(['index'], axis=1)
df_generalization = df_generalization.reset_index().drop(['index'], axis=1)
df_with_na_valid, df_with_na_generalization = train_test_split(df_with_na, test_size=0.2, random_state=42)
df_with_na_valid = df_with_na_valid.reset_index().drop(['index'], axis=1)
df_with_na_generalization = df_with_na_generalization.reset_index().drop(['index'], axis=1)

In [16]:
version_counter = 0
problem_sets = []

In [17]:
def save_version(template, df, version_counter):
    version_id = template.format(str(version_counter))
    df.to_csv(version_id, index=False)
    version_id = version_id[9:]
    return version_id, version_counter + 1

In [19]:
valid_version_id, version_counter = save_version(template, 
                                                 df_valid, 
                                                 version_counter)
generalization_version_id, version_counter = save_version(template, 
                                                          df_generalization, 
                                                          version_counter)
valid_with_na_version_id, version_counter = save_version(template, 
                                                 df_with_na_valid, 
                                                 version_counter)
generalization_with_na_version_id, version_counter = save_version(template, 
                                                          df_with_na_generalization, 
                                                          version_counter)

In [20]:
def generate_versions_col_no_new_T(df_valid, valid_version_id,
                                   df_generalization, generalization_version_id,
                                   problem_sets, version_counter,
                                   add_column_func, setup):
    df_prime_valid = add_column_func(df_valid)
    df_prime_generalization = add_column_func(df_generalization)
    
    df_prime_valid_version, version_counter = save_version(template, 
                                                           df_prime_valid, 
                                                           version_counter)    
    df_prime_generalization_version, version_counter = save_version(template, 
                                                                    df_prime_generalization, 
                                                                    version_counter)    
    
    problem_sets.append({'T_validation': valid_version_id, 
                         'T_prime_validation': df_prime_valid_version, 
                         'T_generalization': generalization_version_id, 
                         'T_prime_generalization': df_prime_generalization_version,
                          'Setup': setup})
    return version_counter, problem_sets

In [21]:
def generate_versions_col_new_T(df_valid, valid_version_id,
                                   df_generalization, generalization_version_id,
                                   problem_sets, version_counter,
                                   tras_column_func, setup):
    df_valid, df_prime_valid = tras_column_func(df_valid)
    df_generalization, df_prime_generalization = tras_column_func(df_generalization)
    
    valid_version_id, version_counter = save_version(template, 
                                                     df_valid, 
                                                     version_counter)
    df_prime_valid_version, version_counter = save_version(template, 
                                                           df_prime_valid, 
                                                           version_counter)
    generalization_version_id, version_counter = save_version(template, 
                                                              df_generalization, 
                                                              version_counter)
    df_prime_generalization_version, version_counter = save_version(template, 
                                                                    df_prime_generalization, 
                                                                    version_counter)    
    
    problem_sets.append({'T_validation': valid_version_id, 
                         'T_prime_validation': df_prime_valid_version, 
                         'T_generalization': generalization_version_id, 
                         'T_prime_generalization': df_prime_generalization_version,
                          'Setup': setup})
    return version_counter, problem_sets

In [22]:
def col_addition_removal_row_addition_removal_transformations1(df):
    noise_rate_for_nans = 0.8
    noise_rate_for_duplicate_column = 0.15
    bootstrap_size = 5
    sampled_size = 5
    df_prime = df.copy()

    df['Series_Title_exact_copy'] = df['Series_Title']
    df['Series_Title_with_noise'] = df['Series_Title']
    noise_size = int(noise_rate_for_duplicate_column * len(df))
    sampled_data = df['Series_Title_with_noise'].sample(n=noise_size, random_state=1).tolist()
    random_ixs = random.sample(range(len(df)), noise_size)
    df['Series_Title_with_noise'].iloc[random_ixs] = sampled_data
    df['Genre_with_nan_noise'] = df['Genre']
    noise_size = int(noise_rate_for_nans * len(df))
    random_ixs = random.sample(range(len(df)), noise_size)
    df['Genre_with_nan_noise'].iloc[random_ixs] = pd.NaT
    
    df_prime = df_prime[df_prime['IMDB_Rating'] >= 8]
    
    df_prime = df_prime.reset_index()
    df_prime['orig_index'] = df_prime['index']
    df_prime = df_prime.drop(['index'], axis=1)
    
    bootstrapped_data = df_prime.sample(n=bootstrap_size, random_state=1)
    bootstrapped_data['orig_index'] = list(range(len(df), len(df) + bootstrap_size))
    bootstrapped_data.index = list(range(len(df), len(df) + bootstrap_size))
    df_prime = df_prime.append(bootstrapped_data)
    
    
    df_prime.loc[:, 'Number_of_genres'] = df_prime['Genre'].str.count(',') + 1
    df_prime.loc[:, 'Title_len'] = df_prime['Series_Title'].str.len()
    sep = random.sample(['.', ' ', '\n', ',', '?'], 1)[0]
    df_prime.loc[:, 'sep_Overview'] = df_prime['Overview'].str.count(sep)
    # Numeric Operations:
    df_prime.loc[:, 'IMDB_Rating_power_2_plus_price'] = (df_prime['IMDB_Rating'] * df_prime['IMDB_Rating']) + df_prime['IMDB_Rating']
    math_trans = [np.log, np.reciprocal, np.sqrt, np.reciprocal]
    df_prime.loc[:, 'Rating_trans'] = random.sample(math_trans, 1)[0](df_prime['IMDB_Rating'])
    
    df_prime.loc[:, 'IMDB_Rating_norm_sum'] = df_prime['IMDB_Rating'] / df_prime['IMDB_Rating'].sum()
    df_prime.loc[:, 'IMDB_Rating_sub_mean'] = df_prime['IMDB_Rating'] - df_prime['IMDB_Rating'].mean()
    df_prime.loc[:, 'IMDB_Rating_norm_minmax'] = (df_prime['IMDB_Rating'] - df_prime['IMDB_Rating'].min()) / \
                                           (df_prime['IMDB_Rating'].max() - df_prime['IMDB_Rating'].min())
    df_prime['is_top_movie'] = 0
    df_prime.loc[df_prime['IMDB_Rating'] > random.uniform(8, 10), 'is_top_movie'] = 1
    df_prime.loc[:, 'Rating_on_1_to_2_scale'] = pd.qcut(df_prime['IMDB_Rating'], 2, labels=False)
    
    df_prime.loc[:, 'random_score'] = np.random.randint(0, 100, size=(len(df_prime), 1))
    
    df_prime["Genre_code"] = df_prime["Genre"].astype('category').cat.codes
    
    new_columns_order = [col for col in df_prime.columns if col != 'orig_index'] + ['orig_index', ]
    df_prime = df_prime[new_columns_order]
    return df, df_prime

In [23]:
version_counter, problem_sets = generate_versions_col_new_T(df_with_na_valid.copy(), valid_with_na_version_id,
                                                               df_with_na_generalization.copy(), generalization_with_na_version_id,
                                                               problem_sets, version_counter,
                                                               col_addition_removal_row_addition_removal_transformations1, 'mixed_mixed_mixed_rows_mixed_columns')

  df_prime = df_prime.append(bootstrapped_data)
  df_prime = df_prime.append(bootstrapped_data)


In [24]:
pd.DataFrame(problem_sets).to_csv('Versions/problem_sets.csv', index=False)