In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Part 1
**Q1**

In [None]:
!rm *.csv

from google.colab import files
print("Please load the raw csv data")
uploadedData = files.upload()
filename = list(uploadedData)[0]

In [None]:
# Parse the uploaded CSV (its name was captured by the upload cell) into the working DataFrame.
dataset = pd.read_csv(filename)

In [None]:
# from google.colab import files /# use for debug only
# outputPath = "saved_file.csv"
# dataset.to_csv(outputPath)
# files.download(outputPath) 

In [None]:
# Show the freshly loaded frame as the cell output for a quick visual check.
dataset

**Q3**

In [None]:
# Q3 - give every column an explicit dtype.
# Whole-number columns use pandas' nullable 'Int64', the integer dtype that can still hold missing values.
int_cols = {'AgeGroup', 'ConversatiosPerDay', 'DisciplineScore', 'HappinessScore', 'MedicalCarePerYear', 'NrCousins', 'StepsPerYear'}
for col_name in int_cols:
  dataset[col_name] = dataset[col_name].astype('Int64')

# Remaining columns: free text -> 'string', closed value sets -> 'category', dates -> datetime.
other_dtypes = {
  'Address': 'string',
  'BloodType': 'category',
  'DateOfPCRTest': 'datetime64[ns]',
  'Job': 'string',
  'Self_declaration_of_Illness_Form': 'string',
  'Sex': 'category',
  'Virus': 'category',
  'SpreadLevel': 'category',
  'Risk': 'category',
}
for typed_col, target_dtype in other_dtypes.items():
  dataset[typed_col] = dataset[typed_col].astype(target_dtype)

dataset.dtypes


**Before splitting the data into subsets, we first modify some of the columns (as part of Q 2.6).**



**The next few code cells derive new, numerically encoded columns from the string-based categorical columns.**

In [None]:
# Ordinal encoding shared by both target columns: low < medium < high.
dic_tmp = {'low': 1, 'medium': 2, 'high': 3}
for level_col in ('SpreadLevel', 'Risk'):
  tmp_col = dataset[level_col].apply(lambda level: dic_tmp[level]).astype('Int64')
  # The numeric twin is placed directly after its source column.
  dataset.insert(loc=dataset.columns.get_loc(level_col)+1, column=level_col + '_scala', value=tmp_col)

In [None]:
# CurrentLocation cells are split on single quotes: piece 1 is stored as x_location and piece 3 as y_location.
# Cells that are not strings (missing values) become NaN.
# y is inserted first so that the later x insert, at the same position, ends up in front of it.
for coord_name, piece in (('y_location', 3), ('x_location', 1)):
  tmp_col = dataset.CurrentLocation.apply(
      lambda cell: cell.split('\'')[piece] if type(cell) == str else np.nan
  ).astype(float)
  dataset.insert(loc=dataset.columns.get_loc('CurrentLocation')+1, column=coord_name, value=tmp_col)

In [None]:
# Month number of every PCR test date; rows without a date stay missing in the nullable 'Int64' result.
tmp_col = dataset.DateOfPCRTest.dt.month.astype('Int64')
dataset.insert(loc=dataset.columns.get_loc('DateOfPCRTest')+1, column='MonthOfPCRTest', value=tmp_col)

In [None]:
# Bucket the yearly parking-ticket expense into 7 equal-width bins labelled 1..7.
ticket_bins = 7
tmp_col = pd.cut(
    dataset['HouseholdExpenseParkingTicketsPerYear'],
    ticket_bins,
    labels=list(range(1, ticket_bins + 1)),
).astype('Int64')
dataset.insert(loc=dataset.columns.get_loc('HouseholdExpenseParkingTicketsPerYear')+1, column='TicketsPerYearGroup', value=tmp_col)

**Q 6**

Note: we did this before Q 4 on purpose.

In [None]:
# This cell derives easier-to-use columns from the Self_declaration_of_Illness_Form column.
# Each form is a ';'-separated list of symptoms; a missing form becomes an empty list.
tmp_col = dataset.Self_declaration_of_Illness_Form.apply(lambda row: [] if type(row) != str else sorted([x.strip() for x in row.split(';')]))
dataset.insert(loc=dataset.columns.get_loc('Self_declaration_of_Illness_Form')+1, column='Symptoms_list', value=tmp_col)

# Every distinct symptom mentioned anywhere in the data.
all_sym = set().union(*dataset.Symptoms_list)

# One boolean indicator column per symptom.
# Iterating the set directly gave a column order that changes between kernel restarts (string hashing
# is randomised per process). Each insert lands directly after Symptoms_list, so walking the symptoms
# in reverse alphabetical order leaves the Is_having_* columns in alphabetical order on every run.
for sym in sorted(all_sym, reverse=True):
  tmp_col = dataset.Symptoms_list.apply(lambda row: sym in row)
  dataset.insert(loc=dataset.columns.get_loc('Symptoms_list')+1, column="Is_having_"+sym, value=tmp_col)

In [None]:
# this code-cell extracts the region from the address column
def extract_only_city(s):
  """Return the region part of an address string, or None for a non-string value.

  The text after the last carriage return is cut at its first comma, all digit
  characters are removed from what remains, and surrounding whitespace is stripped.
  """
  if type(s) is not str:
    return None
  last_line = s.split('\r')[-1]
  before_comma = last_line.split(',')[0]
  without_digits = ''.join(ch for ch in before_comma if not ch.isdigit())
  return without_digits.strip()

# extract_only_city returns None for a missing address. Casting with `astype(str)` turned that None
# into the literal text 'None', which the later missing-data checks could no longer recognise.
# The nullable 'string' dtype (the same one the Address column uses) keeps it as a real missing value.
tmp_col = dataset.Address.apply(extract_only_city).astype('string')
dataset.insert(loc=dataset.columns.get_loc('Address')+1, column='Region', value=tmp_col)

# Part 2

**We have finished modifying the basic columns we will work on, so we can now split the data into train, validation and test subsets.**

**Q 4**

In [None]:
# Q 2.4 - splitting the data into train, validation and test subsets
from sklearn.model_selection import train_test_split

split_seed = 12  # fixed seed, so the same rows land in the same subset on every run
# First hold out 20% of all rows for testing ...
tmp, test = train_test_split(dataset, test_size=0.2, random_state=split_seed)
# ... then a quarter of the remaining rows for validation.
train, validation = train_test_split(tmp, test_size=0.25, random_state=split_seed)

# The following cells modify these subsets, which pandas reports as 'SettingWithCopyWarning'.
# Those messages are not relevant here, so they are switched off (the pandas default is 'warn').
pd.set_option('mode.chained_assignment', None)


(For Q 6, see the cells a few blocks above.)

##Missing data
here we study our data, in particular its distribution, in order to choose how to fill missing data


**Q 7 - remove attributes**

**Manually removing attributes with a high percentage of divergence**

In [None]:
# Share of distinct values per row for the free-text columns; every column below 0.8 is dropped from train.
# NOTE(review): the variable name speaks of *high* divergence while the test selects ratios *below* 0.8 - confirm the intended direction.
row_count = train.shape[0]
high_divergence = [col for col in ['Region', 'Job'] if train[col].nunique()/row_count < 0.8]
print(high_divergence)
train.drop(columns=high_divergence, inplace=True)

['Region', 'Job']


In [None]:
# Region (derived from Address) was not found useful, so the raw Address column is dropped as well.
train.drop('Address', axis='columns', inplace=True)

**Manually removing attributes with a high percentage of missing data**

In [None]:
# Columns in which fewer than 80% of the training rows hold a value are dropped.
filled_share = train.notnull().sum() / train.shape[0]
many_deficiencies = [col for col in train.columns if filled_share[col] < 0.8]
print(many_deficiencies)
train.drop(columns=many_deficiencies, inplace=True)

['PCR_11', 'PCR_15']


**Q 8 - filling missing data: columns with a roughly normal distribution are filled with their mean**

In [None]:
# Histogram of every numeric column ('Int64' or float), used to decide how its missing cells should be filled.
for col in train:
  is_numeric = train[col].dtypes == 'Int64' or train[col].dtypes == float
  if not is_numeric:
    continue
  sns.histplot(train[col])
  plt.grid()
  plt.title(col + ' - examination of distribution')
  plt.show()

In [None]:
# Columns with a close-to-normal distribution: missing cells are filled with the column mean.
from statistics import mean

for col_name in ['BMI', 'ConversatiosPerDay', 'DisciplineScore', 'HouseholdExpenseOnPresents',
                 'PCR_7', 'PCR_72', 'PCR_89', 'SocialActivitiesPerDay', 'SportsPerDay']:
  fill_value = train[col_name].mean()
  if train[col_name].dtypes == 'Int64':
    fill_value = round(fill_value)  # integer columns get the mean rounded to a whole number
  train[col_name] = train[col_name].fillna(fill_value)



  


In [None]:
# Columns with a non-normal distribution: missing cells are filled with the median,
# which is more meaningful than the mean in this case.
median_filled_cols = ['AgeGroup', 'HappinessScore', 'HouseholdExpenseOnSocialGames', 'HouseholdExpenseParkingTicketsPerYear',
                      'MedicalCarePerYear', 'NrCousins', 'PCR_19', 'PCR_95', 'StepsPerYear']
for col_name in median_filled_cols:
  col_median = train[col_name].median()
  train[col_name] = train[col_name].fillna(col_median)

In [None]:
# Note: we chose to fill in only the numerical columns whose distribution is clear enough to study.

## Outlier Detection

**Q 9**

In [None]:
# One box plot per numeric feature, grouped by Virus, to spot outliers visually.
# The slice skips the first column and the last five columns of train.
def _is_numeric_col(name):
  return train[name].dtypes == 'Int64' or train[name].dtypes == float

col_list = [col for col in train.columns[1:-5] if _is_numeric_col(col)]
for col in col_list:
  ax = sns.boxplot(x=train[col], y=train['Virus'])
  plt.show()

**Q 10**

**Z-score (only on data with close-to-normal distribution)**

In [None]:
# Z-score outlier handling for the columns with a close-to-normal distribution.
# Values lying 3 or more standard deviations above the column mean are replaced by that mean.
# NOTE(review): only the upper tail is treated (z >= 3), exactly as before - confirm whether the
# lower tail (z <= -3) should be handled too.
for col in ['ConversatiosPerDay', 'BMI', 'HouseholdExpenseOnPresents', 'PCR_19', 'PCR_7',
            'PCR_72', 'PCR_89', 'SocialActivitiesPerDay', 'SocialMediaPerDay']:
  # Fix: the dtype test used to read `col_name`, a variable left over from an earlier cell,
  # so the decision whether to round the mean was taken for the wrong column.
  if train[col].dtypes == 'Int64':
    m = round(train[col].mean())
  else:
    m = train[col].mean()
  tmp_col = (train[col] - m)/train[col].std(ddof=0)
  # Missing values never count as outliers (their comparison result is NA/False).
  outlier_mask = (tmp_col >= 3).fillna(False).astype(bool)
  # A single .loc write replaces the row-by-row chained assignment (train[col][index] = m).
  train.loc[outlier_mask, col] = m

**BoxPlots - Finding Outliers**

In [None]:
# Manually correcting outliers: DisciplineScore values below 0 or above 10 are replaced by the median.
m = train['DisciplineScore'].median()
scores = train['DisciplineScore']
# Missing scores are left alone (their comparison result is NA, treated here as "not an outlier").
out_of_range = ((scores < 0) | (scores > 10)).fillna(False).astype(bool)
# A single .loc write replaces the row-by-row chained assignment used before.
train.loc[out_of_range, 'DisciplineScore'] = m
ax = sns.boxplot(x=train['DisciplineScore'], y=train['Virus'])
plt.show()

In [None]:
# Box-plot (IQR) rule: for every numeric feature, values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
# are replaced by the column median, then the box plot is redrawn.
col_list = [col for col in train.columns[1:-5] if train[col].dtypes == 'Int64' or train[col].dtypes == float]
print(col_list)
for i in col_list:
  q1 = train[i].quantile(0.25)
  q3 = train[i].quantile(0.75)
  iqr = q3-q1 #Interquartile range
  fence_low  = q1-1.5*iqr
  fence_high = q3+1.5*iqr
  m = train[i].median()
  outlier_mask = (train[i] < fence_low) | (train[i] > fence_high)
  # Comparisons on nullable columns yield NA for missing cells; as before, those cells receive the
  # median as well. Plain float columns produce no NA here, so their NaN cells stay untouched.
  outlier_mask = outlier_mask.fillna(True).astype(bool)
  # A single .loc write replaces the row-by-row chained assignment (train[i][index] = m).
  train.loc[outlier_mask, i] = m
  ax = sns.boxplot(x=train[i], y=train['Virus'])
  plt.show()

**Data Transformation - Normalization**

**Q 11**

In [None]:
# Distribution of StepsPerYear before any normalisation.
steps_title = 'StepsPerYear - examination of distribution'
sns.histplot(train['StepsPerYear'])
plt.grid()
plt.title(steps_title)
plt.show()

Here we try to determine which normalization works better for this feature.

In [None]:
from sklearn import preprocessing

# Option 1 - min-max scaling. It is tried on a copy only, to show that this option was checked;
# the option actually applied to train is the one in the next cell.
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(train[['StepsPerYear']].copy())
steps_copy = x_scaled

sns.histplot(steps_copy)
plt.grid()
plt.title('StepsPerYear - examination of distribution')
plt.show()

In [None]:
# Option 2 (the one applied): centre StepsPerYear on its median and divide by the population standard deviation.
train['StepsPerYear'] = (train['StepsPerYear'] - train['StepsPerYear'].median())/train['StepsPerYear'].std(ddof=0)

sns.histplot(train.StepsPerYear)
plt.grid()
# Fix: the title used to be built from `col`, a loop variable left over from an earlier cell,
# so the figure was labelled with an unrelated column name.
plt.title('StepsPerYear - examination of distribution')
plt.show()

**Q 12**

normalize all numeric data (not category)

In [None]:
# Min-max scale every continuous column: all float columns, plus 'Int64' columns with more than 25 distinct values.
for col in train.columns:
  col_dtype = train[col].dtypes
  is_continuous = col_dtype == float or (col_dtype == 'Int64' and train[col].unique().size > 25)
  if is_continuous:
    train[col] = min_max_scaler.fit_transform(train[[col]])

train

**we rerun distribution plots for all attributes to see how they changed**

In [None]:
# Re-draw the histogram of every numeric column to see how the previous steps changed it.
numeric_cols = [name for name in train if train[name].dtypes == 'Int64' or train[name].dtypes == float]
for col in numeric_cols:
  sns.histplot(train[col])
  plt.grid()
  plt.title(col + ' - examination of distribution')
  plt.show()

# Part 3 - Feature Selecting

**Q 13 - correlation table**

In [None]:
# Numeric copy of train for the correlation table: the symptom flags, Sex and BloodType are
# replaced by numeric codes 1, 2, ... assigned in the order returned by unique().
train_copy = train.copy()
catagory_col = [col for col in train_copy.columns if 'Is_having_' in col] + ['Sex', 'BloodType']
for col in catagory_col:
  dic_tmp = {value: code for code, value in enumerate(train_copy[col].unique(), start=1)}
  train_copy[col] = train_copy[col].apply(lambda row: dic_tmp[row]).astype(float)

In [None]:
# Correlation matrix of the numeric columns of train_copy.
# The numeric columns are selected once, up front: this keeps the tick labels aligned with the
# matrix by construction, and DataFrame.corr() is never handed the non-numeric columns
# (pandas >= 2.0 raises on them instead of silently skipping them).
numeric_part = train_copy.select_dtypes(['number'])
corr_matrix = numeric_part.corr()
tick_positions = range(len(corr_matrix.columns))

f = plt.figure(figsize=(20, 15))
plt.matshow(corr_matrix, fignum=f.number)
plt.xticks(tick_positions, corr_matrix.columns, fontsize=14, rotation=90)
plt.yticks(tick_positions, corr_matrix.columns, fontsize=14)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)
_ = plt.title('Correlation Matrix', fontsize=16)


In [None]:
# Pairwise correlations between the numeric / boolean columns of train; only the strong ones (>= 0.85) are kept.
# The columns are selected explicitly because DataFrame.corr() raises on non-numeric columns in
# pandas >= 2.0, and the matrix is computed once (the extra train.corr() call used to discard its result).
corr = train.select_dtypes(['number', 'bool']).corr()
kot = corr[corr>=.85]  # weaker correlations become NaN
plt.figure(figsize=(20,15))
sns.heatmap(kot, cmap="Greens")
plt.show()
# Exact values for the pairs inspected by hand.
print(train['PCR_83'].corr(train['PCR_45']))
print(train['HouseholdExpenseOnSocialGames'].corr(train['SportsPerDay']))
print(train['SocialActivitiesPerDay'].corr(train['HouseholdExpenseOnSocialGames']))


In [None]:
# Manually removing attributes with high correlation.
highly_correlated = ['NrCousins', 'PCR_83', 'SocialActivitiesPerDay', 'SportsPerDay', 'StepsPerYear']
for correlated_col in highly_correlated:
  train.drop(correlated_col, axis='columns', inplace=True)

**Q 14**

In [None]:
# Explore how the target attributes relate to one another.
# Rows with a missing target are removed and the nullable scores are cast to plain int for plotting.
train_copy = (
    train[['Virus', 'SpreadLevel_scala', 'Risk_scala']]
    .dropna()
    .astype({'SpreadLevel_scala': int, 'Risk_scala': int})
)
sns.jointplot(data=train_copy, x='SpreadLevel_scala', y='Risk_scala', hue='Virus')
plt.grid()
plt.title('SpreadLevel_scala - Risk_scala joint distribution')
plt.show()
# Note: we did not find this plot useful.

**Filter methods**

we select according to joint attributes distribution plots 

In [None]:
from itertools import combinations

# Every unordered pair of columns, leaving out the last five columns of train.
all_combos = list(combinations(train.columns[:-5], 2))
print(all_combos)
print(len(all_combos))

[('ID', 'AgeGroup'), ('ID', 'BMI'), ('ID', 'BloodType'), ('ID', 'ConversatiosPerDay'), ('ID', 'CurrentLocation'), ('ID', 'x_location'), ('ID', 'y_location'), ('ID', 'DateOfPCRTest'), ('ID', 'MonthOfPCRTest'), ('ID', 'DisciplineScore'), ('ID', 'HappinessScore'), ('ID', 'HouseholdExpenseOnPresents'), ('ID', 'HouseholdExpenseOnSocialGames'), ('ID', 'HouseholdExpenseParkingTicketsPerYear'), ('ID', 'TicketsPerYearGroup'), ('ID', 'MedicalCarePerYear'), ('ID', 'PCR_10'), ('ID', 'PCR_17'), ('ID', 'PCR_19'), ('ID', 'PCR_32'), ('ID', 'PCR_45'), ('ID', 'PCR_46'), ('ID', 'PCR_7'), ('ID', 'PCR_72'), ('ID', 'PCR_76'), ('ID', 'PCR_8'), ('ID', 'PCR_89'), ('ID', 'PCR_9'), ('ID', 'PCR_93'), ('ID', 'PCR_95'), ('ID', 'Self_declaration_of_Illness_Form'), ('ID', 'Symptoms_list'), ('ID', 'Is_having_Low_appetite'), ('ID', 'Is_having_Sore_throat'), ('ID', 'Is_having_Nausea_or_vomiting'), ('ID', 'Is_having_Muscle_or_body_aches'), ('ID', 'Is_having_Diarrhea'), ('ID', 'Is_having_Fever'), ('ID', 'Is_having_Fatigue

In [None]:
# manually explore joint attributes distribution
def joint_att_distribution(start, finish):
  """Plot joint distributions for a slice of the attribute pairs in `all_combos`.

  For every pair in all_combos[start:finish] whose two columns are both numeric
  ('Int64' or float) in `train`, one seaborn joint plot is drawn per target label
  (Virus, Risk, SpreadLevel), coloured by that label.

  Args:
    start: index of the first pair of all_combos to plot (inclusive).
    finish: index one past the last pair to plot (exclusive).
  """
  for pair in all_combos[start:finish]:
    col1 = pair[0]
    col2 = pair[1]
    col1_numeric = train[col1].dtypes == 'Int64' or train[col1].dtypes == float
    col2_numeric = train[col2].dtypes == 'Int64' or train[col2].dtypes == float
    if not (col1_numeric and col2_numeric):
      continue
    # Fix: this used to read `train_copy.dropna` without parentheses, which never removed anything,
    # so the int casts below could fail on missing values. The frame is the same for all three
    # labels, so it is built once per pair.
    train_copy = train[[col1, col2, 'Virus', 'Risk', 'SpreadLevel']].dropna()
    # apparently this specific plot does not work with 'Int64', so such columns are converted to plain int
    for numeric_col in (col1, col2):
      if train_copy[numeric_col].dtypes == 'Int64':
        train_copy[numeric_col] = train_copy[numeric_col].astype(int)
    for label in ['Virus', 'Risk', 'SpreadLevel']:
      sns.jointplot(data=train_copy, x=col1, y=col2, hue=label)
      plt.grid()
      plt.title(col1 + ' - ' + col2 + ' joint distribution')
      plt.show()


In [None]:
# The pairs are plotted in slices of 400 because all of them are too much output for one code cell.
joint_att_distribution(0, 400)

In [None]:
# Second slice of attribute pairs.
joint_att_distribution(400, 800)

In [None]:
# Third slice of attribute pairs.
joint_att_distribution(800, 1200)

In [None]:
# Last slice. NOTE(review): 1485 is hard-coded - presumably len(all_combos) when this was run; confirm.
joint_att_distribution(1200, 1485)

In [None]:
# Now the same exploration with the categorical columns, which are first replaced by numeric codes
# (1, 2, ... in the order returned by unique()) so that the same plotting code can run on them.
train_copy = train.copy()
catagory_col = [col for col in train_copy.columns if 'Is_having_' in col] + ['Sex', 'BloodType']
print(catagory_col)
for col in catagory_col:
  dic_tmp = {value: code for code, value in enumerate(train_copy[col].unique(), start=1)}
  train_copy[col] = train_copy[col].apply(lambda row: dic_tmp[row]).astype(float)

# Every categorical column is paired with every other numeric column.
for col1 in catagory_col:
  for col2 in train_copy.columns[1:-5]:
    col2_numeric = train_copy[col2].dtypes == 'Int64' or train_copy[col2].dtypes == float
    if col1 == col2 or not col2_numeric:
      continue
    for label in ['Virus', 'Risk', 'SpreadLevel']:
      train_copy2 = train_copy[[col1, col2, 'Virus', 'Risk', 'SpreadLevel']]
      train_copy2.dropna(inplace=True)
      # the joint plot needs plain int rather than the nullable 'Int64'
      if train_copy2[col2].dtypes == 'Int64':
        train_copy2[col2] = train_copy2[col2].astype(int)
      sns.jointplot(data=train_copy2, x=col1, y=col2, hue=label)
      plt.grid()
      plt.title(col1 + ' - ' + col2 + ' joint distribution')
      plt.show()


**Wrapper method - forward**

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
training_model = LinearRegression()

# Candidate columns: every numeric ('Int64' / float) or boolean column of train.
col_names = [col for col in train.columns if train[col].dtypes == 'Int64' or train[col].dtypes == float or train[col].dtypes == bool]

train_copy = train.copy()
# Two numeric stand-ins for the Virus target: a code per distinct value (1, 2, ... in the order
# returned by unique()) and a covid / not-covid flag.
dic_tmp = {virus: code for code, virus in enumerate(train.Virus.unique(), start=1)}
train_copy['Virus_numeric'] = train_copy.Virus.apply(lambda row: dic_tmp[row]).astype(int)
train_copy['Is_covid'] = train_copy.Virus.apply(lambda row: 1 if row == 'covid' else 0).astype(int)

col_names += ['Virus_numeric', 'Is_covid']

# Only complete rows are kept.
train_copy = train_copy[col_names].dropna()

# The last four entries of col_names are left out of the feature matrix.
X =  train_copy[col_names[:-4]]
sfs = SFS(training_model, k_features=12, forward=True, floating=False)

print(col_names)
train_copy

In [None]:
# Forward selection with Risk_scala as the target; one row per subset size.
y = train_copy['Risk_scala']
sfs.fit(X, y)

top_att1 = pd.DataFrame(sfs.subsets_).T
top_att1

In [None]:
# Forward selection with SpreadLevel_scala as the target; one row per subset size.
y = train_copy['SpreadLevel_scala']
sfs.fit(X, y)

top_att2 = pd.DataFrame(sfs.subsets_).T
top_att2

In [None]:
# Forward selection with the covid / not-covid flag as the target; one row per subset size.
y = train_copy['Is_covid']
sfs.fit(X, y)

top_att3 = pd.DataFrame(sfs.subsets_).T
top_att3

In [None]:
# Forward selection with the numeric Virus code as the target; one row per subset size.
y = train_copy['Virus_numeric']
sfs.fit(X, y)

top_att4 = pd.DataFrame(sfs.subsets_).T
top_att4

In [None]:
# Union of the feature subsets stored under label 8 in each of the four forward-selection tables,
# without duplicates and in alphabetical order.
forward_tables = (top_att1, top_att2, top_att3, top_att4)
chosen_att = sorted(set().union(*(table.feature_names[8] for table in forward_tables)))
print(chosen_att)

['AgeGroup', 'BMI', 'ConversatiosPerDay', 'DisciplineScore', 'HappinessScore', 'HouseholdExpenseOnPresents', 'HouseholdExpenseOnSocialGames', 'ID', 'Is_having_Cough', 'Is_having_Diarrhea', 'Is_having_Fatigue', 'Is_having_Fever', 'Is_having_Shortness_of_breath', 'MedicalCarePerYear', 'PCR_10', 'PCR_17', 'PCR_19', 'PCR_32', 'PCR_72', 'PCR_8', 'PCR_89', 'PCR_9', 'PCR_95', 'SocialMediaPerDay']


**Wrapper method - backward**

In [None]:
# Backward elimination over the attributes that the forward pass did not choose.
# In each of up to 8 rounds, every remaining candidate is left out in turn and a KNN classifier
# predicting Is_covid is cross-validated on the rest; the candidate whose removal leaves the
# best score is the least useful one and is moved to worst_att_list.
col_to_check =  [col for col in col_names if col not in chosen_att]
training_model = KNeighborsClassifier(n_neighbors=30)

# The last four entries are the target columns (the forward-selection cell excludes the same four
# from its feature matrix). Fix: they used to stay inside X, so the label Is_covid was itself a feature.
candidates = col_to_check[:-4]
y = train_copy.Is_covid

worst_att_list = []
for round_no in range(8):
  if len(candidates) < 2:
    break  # at least one feature must remain once a candidate is left out
  best_score = float('-inf')
  curr_worst = None
  for col in candidates:
    curr_cols = [name for name in candidates if name != col]
    X = train_copy[curr_cols]
    # With k_features equal to the number of columns nothing is eliminated; the selector is only
    # used to obtain the cross-validated score of this exact column set.
    sbs = SFS(training_model, forward=False, floating=False, k_features=len(curr_cols))
    sbs = sbs.fit(X, y)
    # Fix: read the score from the documented attribute instead of positional indexing into the
    # transposed subsets_ table, which depends on the column order of that table.
    curr_score = sbs.k_score_
    # print('BP - ' + col + ', score: ', curr_score)
    # Fix: the comparison used to keep the *lowest* score, i.e. the attribute whose removal hurt
    # the most - the most useful attribute rather than the worst one.
    if curr_score > best_score:
      best_score = curr_score
      curr_worst = col
  if curr_worst is None:
    break  # no usable score in this round (e.g. all NaN)
  worst_att_list.append(curr_worst)
  candidates.remove(curr_worst)
  col_to_check.remove(curr_worst)

worst_att_list

**Bi-Variate Analysis**

In [None]:
# Manually explore how each low-cardinality attribute (fewer than 25 distinct values, non-object dtype)
# is distributed across the three target attributes, as stacked bar charts.
for col in train.columns:
  is_low_cardinality = train[col].dtypes != object and train[col].unique().size < 25
  if not is_low_cardinality:
    continue
  for label in ['Virus', 'Risk', 'SpreadLevel']:
    new_plot = pd.crosstab(train[col], train[label])
    new_plot.plot(kind='bar', stacked=True,  grid=False)
    plt.grid()
    plt.show()


**Q 15**

In [None]:
# Q 15 - final take / throw decision per attribute.
# Columns whose name contains 'Is_having' or '_scala', and the ones in manually_remove, are left out of the table.
labels = [x for x in dataset.columns if not ('Is_having' in x or '_scala' in x)]

manually_unwanted = ['Address', 'CurrentLocation', 'Job', 'PCR_11', 'PCR_15','PCR_83', 'NrCousins', 'SportsPerDay', 'SocialActivitiesPerDay', 'StepsPerYear']
manually_probably_not_intresting = ['StudingPerDay']
manually_wanted = ['Self_declaration_of_Illness_Form', 'Virus', 'SpreadLevel', 'Risk']
manually_remove = ['Region', 'TicketsPerYearGroup', 'MonthOfPCRTest', 'Symptoms_list', 'x_location', 'y_location']

# 'N' (throw): listed by the backward wrapper or ruled out by hand.
# 'Y' (take): everything else - chosen by the forward wrapper, wanted by hand, or simply left over.
rejected = set(worst_att_list) | set(manually_unwanted) | set(manually_probably_not_intresting)
data = [[col, 'N' if col in rejected else 'Y'] for col in labels if col not in manually_remove]

df = pd.DataFrame(data, columns = ['Attribute', 'Take_or_throw'])
df

In [None]:
# Split the decision table into the attributes that are thrown away and the ones that are taken.
is_thrown = df.Take_or_throw == 'N'
is_taken = df.Take_or_throw == 'Y'

thrown_col = df.loc[is_thrown, 'Attribute'].to_list()
print(len(thrown_col), "not taken:")
print(thrown_col)

taken_col = df.loc[is_taken, 'Attribute'].to_list()
print(len(taken_col), "taken:")
print(taken_col)

**Q 16**

In [None]:
# # not sure if needed or not...
# from sklearn.model_selection import train_test_split
# tmp, test = train_test_split(dataset, test_size=0.2, random_state=12)
# train, validation =  train_test_split(tmp, test_size=0.25, random_state=12)

# # we are going to modify slices of dataset so pandas see it as 'SettingWithCopyWarning'. those msg are not relevant and annoying so...
# pd.options.mode.chained_assignment = None  # default='warn'


In [None]:
# Reduce each subset to the taken columns; independent copies, so the original frames stay untouched.
train_modify, validation_modify, test_modify = (
    subset[taken_col].copy() for subset in (train, validation, test)
)
# print(train_modify.columns) # only for debug
# train_modify

Index(['ID', 'AgeGroup', 'BMI', 'BloodType', 'ConversatiosPerDay',
       'DateOfPCRTest', 'DisciplineScore', 'HappinessScore',
       'HouseholdExpenseOnPresents', 'HouseholdExpenseOnSocialGames',
       'MedicalCarePerYear', 'PCR_10', 'PCR_17', 'PCR_19', 'PCR_32', 'PCR_72',
       'PCR_8', 'PCR_89', 'PCR_9', 'PCR_95',
       'Self_declaration_of_Illness_Form', 'Sex', 'SocialMediaPerDay', 'Virus',
       'SpreadLevel', 'Risk'],
      dtype='object')

In [None]:
from google.colab import files

# Write every subset and its column-reduced "_modify" version to CSV and download each file in turn.
for outputPath, frame in [
    ("train_file.csv", train),
    ("train_modify_file.csv", train_modify),
    ("validation.csv", validation),
    ("validation_modify_file.csv", validation_modify),
    ("test_file.csv", test),
    ("test_modify_file.csv", test_modify),
]:
  frame.to_csv(outputPath)
  files.download(outputPath)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# TOYOTA - scratch cells for isolated runs and mini tests


In [None]:
# # TOYOTA - for isolte running and mini tests
# col1 = 'BMI'
# col2 = 'HouseholdExpenseOnPresents'
# if (train[col1].dtypes == 'Int64' or train[col1].dtypes == float) and col1 != col2 and (train[col2].dtypes == 'Int64' or train[col2].dtypes == float):
#   for label in ['Virus', 'Risk', 'SpreadLevel']:
#     train_copy = train[[col1, col2, 'Virus', 'Risk', 'SpreadLevel']]
#     train_copy.dropna(inplace=True)
#     if train_copy[col1].dtypes == 'Int64': # apperently this specific plot does not work with 'Int64' so i need to convert it to simple int
#       train_copy[col1] = train_copy[col1].astype(int)
#     if train_copy[col2].dtypes == 'Int64':
#       train_copy[col2] = train_copy[col2].astype(int) 
#     sns.jointplot(data=train_copy, x=col1, y=col2, hue=label)
#     plt.grid()
#     plt.title(col1 + ' - ' + col2 + ' joint distribution')
#     plt.show()


In [None]:
# # TOYOTA - for isolte running and mini tests
# for col1 in ['PCR_89']:
#   for col2 in train.columns[1:-5]:

#     if (train[col1].dtypes == 'Int64' or train[col1].dtypes == float) and col1 != col2 and (train[col2].dtypes == 'Int64' or train[col2].dtypes == float):
#       for label in ['Virus', 'Risk', 'SpreadLevel']:
#         train_copy = train[[col1, col2, 'Virus', 'Risk', 'SpreadLevel']]
#         train_copy.dropna(inplace=True)
#         if train_copy[col1].dtypes == 'Int64': # apperently this specific plot does not work with 'Int64' so i need to convert it to simple int
#           train_copy[col1] = train_copy[col1].astype(int)
#         if train_copy[col2].dtypes == 'Int64':
#           train_copy[col2] = train_copy[col2].astype(int) 
#         sns.jointplot(data=train_copy, x=col1, y=col2, hue=label)
#         plt.grid()
#         plt.title(col1 + ' - ' + col2 + ' joint distribution')
#         plt.show()
