In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import files

# Section 1: Quick data exploration and preparation

In [None]:
# Load the labeled dataset directly from its GitHub URL (requires network access).
df_raw = pd.read_csv('https://raw.githubusercontent.com/sivanyo/ML-HW3/main/variant_labeled.csv')
# Last expression of the cell: renders the frame as the cell output.
df_raw

**Q1**

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the labeled rows as a test set; the fixed random_state keeps the split reproducible.
train, test = train_test_split(df_raw, test_size=0.2, random_state=12)

# Later cells modify `train` and `test` in place, which pandas reports as SettingWithCopyWarning.
# The warning is silenced globally here.
# NOTE(review): this also hides genuine chained-assignment mistakes anywhere else in the notebook.
pd.options.mode.chained_assignment = None  # default='warn'

**Q2**

In [None]:
# Sex is categorical, so it is encoded as 1 for 'M' and 0 for anything else before plotting its density.
# NOTE(review): a KDE of a binary variable is hard to interpret; a count plot may fit better — confirm against the question.
tmp = train.Sex.apply(lambda x: 1 if x=='M' else 0)  # the authors state this column has no NaN values
sns.kdeplot(data=tmp)

**Q3**

In [None]:
# One KDE panel of VariantScore per BloodType value, wrapped at four panels per row.
g = (
  sns.FacetGrid(train, col="BloodType", col_wrap=4, height=3.5, aspect=.65)
  .map(sns.kdeplot, "VariantScore")
  .set_titles(col_template="{col_name}", size=18)
  .set_xlabels(size=18)
)
# Light grid on every panel to make the densities easier to compare.
for ax in g.axes.flat:
  ax.grid(alpha=0.5)

# Preprocessing (data preparation)


In [None]:
def category_to_numeric(df):
  """Encode the categorical 'BloodType' and 'Sex' columns as numbers, in place.

  - 'BloodType' becomes 0.0 for 'O+', 'O-', 'B+', 'B-' and 1.0 for every other
    value (missing values included, because they stringify to 'nan').
  - 'Sex' becomes 1 for 'M' and 0 for every other value.

  A column that is already numeric is left untouched, so calling the function
  twice on the same frame (e.g. when a notebook cell is re-run) is harmless.
  Previously a second call mapped every BloodType to 1 and every Sex to 0.

  Args:
    df: DataFrame containing 'BloodType' and 'Sex' columns. Modified in place.

  Returns:
    None.
  """
  normal_dist = ['O+', 'O-', 'B+', 'B-']

  if not pd.api.types.is_numeric_dtype(df['BloodType']):
    in_group = df['BloodType'].astype(str).isin(normal_dist)
    df['BloodType'] = (~in_group).astype(float)

  if not pd.api.types.is_numeric_dtype(df['Sex']):
    df['Sex'] = (df['Sex'] == 'M').astype('int64')

# Encode the categorical columns of both splits in place.
category_to_numeric(train)
category_to_numeric(test)

In [None]:
# Heatmap of strongly correlated feature pairs: only coefficients >= 0.9 are kept,
# everything else is masked to NaN and left blank.
# (The correlation matrix used to be computed twice; the first, discarded call was removed.)
# NOTE(review): strong negative correlations (<= -0.9) are not shown by this mask.
corr = train.corr()
kot = corr[corr>=.9]
plt.figure(figsize=(15,10))
sns.heatmap(kot, cmap="Greens")
plt.show()

In [None]:
def remove_corr_feat(df):
  """Drop, in place, the four features removed after the correlation analysis.

  Raises KeyError if one of the columns is missing (for example when the
  function is applied twice to the same frame).
  """
  # 'HouseholdExpenseOnPresents' is intentionally kept (its drop was disabled by the authors).
  redundant_features = (
    'NrCousins',
    'StepsPerYear',
    'HouseholdExpenseParkingTicketsPerYear',
    'HouseholdExpenseOnSocialGames',
  )
  for feature_name in redundant_features:
    df.drop(columns=feature_name, inplace=True)

# Drop the same columns from both splits so they keep identical feature sets.
remove_corr_feat(train)
remove_corr_feat(test)

In [None]:
# `ppt` holds per-feature statistics computed on the training split; the same values are
# reused later to preprocess other frames (test split, unlabeled data).
# Columns [1:-1] skip the first and the last column — presumably ID and VariantScore; verify.
ppt = pd.DataFrame(columns = ['mean', 'median', 'max', 'min', 'high outlier', 'low outlier'])
for col in train.columns.to_list()[1:-1]:
  # mean and median are taken before outlier handling; the four zeros are placeholders filled in by later cells
  ppt.loc[col] = [train[col].mean(),train[col].median(),0,0,0,0]

In [None]:
# Outlier fences: for each feature store [Q1 - alpha*IQR, Q3 + alpha*IQR] in `ppt`,
# computed on the training split. They are applied by fill_outliers().

alpha_all = 3.5     # IQR multiplier for regular features
alpha_for_pcr = 5   # wider IQR multiplier for columns whose name contains 'PCR_'

for col in train.columns.to_list()[1:-1]:
  q1 = train[col].quantile(0.25)
  q3 = train[col].quantile(0.75)
  iqr = q3-q1 
  current_alpha = alpha_all
  if 'PCR_' in col:
    current_alpha = alpha_for_pcr
  fence_low  = q1-current_alpha*iqr
  # non-PCR features get their lower fence clamped at 0
  if 'PCR_' not in col and fence_low < 0:
    fence_low = 0
  fence_high = q3+current_alpha*iqr
  ppt.at[col, 'low outlier'] = fence_low
  ppt.at[col, 'high outlier'] = fence_high

# For these three columns the fences are fixed by hand instead of using the IQR rule.
ppt.at['AgeGroup', 'low outlier'] = 0
ppt.at['AgeGroup', 'high outlier'] = 9
ppt.at['HappinessScore', 'low outlier'] = 1
ppt.at['HappinessScore', 'high outlier'] = 10
ppt.at['DisciplineScore', 'low outlier'] = 1
ppt.at['DisciplineScore', 'high outlier'] = 10


def fill_outliers(df):
  """Replace missing and out-of-fence values with the training median, in place.

  Uses the global `ppt` table (median and fences per feature) and the feature
  list taken from the global `train` frame (all columns but the first and last).
  """
  for feature in train.columns.to_list()[1:-1]:
    median = ppt.at[feature, 'median']
    lower = ppt.at[feature, 'low outlier']
    upper = ppt.at[feature, 'high outlier']

    def _replace_if_invalid(value):
      # Missing values and values outside [lower, upper] fall back to the median.
      if pd.isnull(value):
        return median
      if value < lower or value > upper:
        return median
      return value

    df[feature] = df[feature].apply(_replace_if_invalid)


# Clean both splits with the statistics learned from the training split.
fill_outliers(train)
fill_outliers(test)

In [None]:
# Normalization: record the (post-cleaning) training range of every feature in `ppt`.
# NOTE: re-running this cell after `train` has been normalized overwrites the stored
# range with 0/1, which would break the preprocessing of any later frame.
for col in train.columns.to_list()[1:-1]:
  ppt.at[col, 'max'] = train[col].max()
  ppt.at[col, 'min'] = train[col].min()


def normalize(df):
  """Min-max scale every feature of `df` in place, using the training range stored in `ppt`.

  Each feature becomes (value - train_min) / (train_max - train_min). A feature that is
  constant on the training split (zero range) is only shifted by train_min instead of
  being divided by zero, which previously produced NaN/inf.

  Args:
    df: DataFrame holding the feature columns of the global `train` frame
      (all of its columns except the first and the last). Modified in place.
  """
  for col in train.columns[1:-1]:
    lower = ppt.at[col, 'min']
    upper = ppt.at[col, 'max']
    span = upper - lower
    if span == 0:
      span = 1.0  # constant feature: avoid 0/0
    df[col] = (df[col] - lower) / span


normalize(train)
normalize(test)

In [None]:
def preprocessing_data(df):
  """Run the complete preparation pipeline on a raw frame, in place.

  Steps, in order: categorical encoding, removal of correlated features,
  outlier/missing-value replacement, min-max normalization.
  """
  pipeline = (category_to_numeric, remove_corr_feat, fill_outliers, normalize)
  for step in pipeline:
    step(df)


# Display the per-feature statistics used by the pipeline.
ppt

In [None]:
# Sanity check: list the remaining columns and display the preprocessed training frame.
print(train.columns)
train

# Section 2: Evaluation


In [None]:
# Summary table, one row per model. 'section' holds a number per model (2-5);
# the two MSE columns start at 0 and are filled in by the cells that evaluate each model.
errors_data = pd.DataFrame(columns= ['section', 'Train MSE', 'Validation MSE'])
errors_data.loc['Dummy'] = [2, 0, 0]
errors_data.loc['Basic Linear'] = [3, 0, 0]
errors_data.loc['Multilevel linear'] = [4, 0, 0]
errors_data.loc['Multilevel poly'] = [5, 0, 0]

In [None]:
# Feature list = every column except the first and the last one
# (presumably ID and the VariantScore target — verify against the data).
attributes = train.columns.to_list()
features = attributes[1:-1]

**Q6**

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error, make_scorer

def CV_evaluation(h, X_train, y_train, n_splits=5):
  """Cross-validate estimator `h` and report its mean squared errors.

  Args:
    h: estimator to evaluate.
    X_train: feature matrix.
    y_train: target values.
    n_splits: value passed to cross_validate as `cv` (default 5).

  Returns:
    (train_mse, valid_mse): MSE averaged over the folds, on the training
    parts and on the held-out parts respectively.
  """
  mse_scorer = make_scorer(mean_squared_error)
  cv_results = cross_validate(
    h, X_train, y_train,
    cv=n_splits,
    scoring=mse_scorer,
    return_train_score=True,
  )
  return tuple(cv_results[key].mean() for key in ('train_score', 'test_score'))

**Q7**

In [None]:
from sklearn.dummy import DummyRegressor

# Baseline model: always predicts the mean of the training targets.
dummy_regressor = DummyRegressor(strategy="mean")

dummy_train_mse, dummy_valid_mse = CV_evaluation(dummy_regressor, train[features], train['VariantScore'])

# Store both scores with one .loc[row, columns] assignment. The chained form
# errors_data.loc[row][col] = value writes into a temporary Series and may leave
# the table unchanged. The MSE columns are cast to float first because they were
# initialised with integer zeros.
mse_columns = ['Train MSE', 'Validation MSE']
errors_data = errors_data.astype({name: float for name in mse_columns})
errors_data.loc['Dummy', mse_columns] = [dummy_train_mse, dummy_valid_mse]

errors_data

# Section 3: Basic linear regression

**Q8**

In [None]:
from sklearn.linear_model import Ridge
from matplotlib.pyplot import semilogx

def tuning(data, features_list, print_flag=True):
  """Select the Ridge regularisation strength by cross-validation.

  Evaluates 100 log-spaced alpha values between 1e0 and 1e5 with CV_evaluation
  and keeps the one with the lowest mean validation MSE (the first one on ties).

  Args:
    data: DataFrame containing `features_list` and the 'VariantScore' target.
    features_list: names of the feature columns to train on.
    print_flag: when True, plot train/validation MSE against alpha.

  Returns:
    (best_score_on_train, best_score_on_validation, best_alpha) for the selected alpha.
  """
  train_scores, validation_scores = [], []
  alpha_values = np.logspace(0, 5, num=100)
  # Start from +inf so the best evaluated alpha is always returned. The previous
  # starting value of 1 silently returned (1, 1, 0) whenever every validation MSE was >= 1.
  best_score_on_train = best_score_on_validation = float('inf')
  best_alpha = 0
  for val in alpha_values:
    train_mse, valid_mse = CV_evaluation(Ridge(val), data[features_list], data['VariantScore'])
    train_scores.append(train_mse)
    validation_scores.append(valid_mse)
    if valid_mse < best_score_on_validation:
      best_score_on_validation = valid_mse
      best_score_on_train = train_mse
      best_alpha = val

  if print_flag:
    plt.semilogx(alpha_values, train_scores, 'r', label='train')
    plt.semilogx(alpha_values, validation_scores, 'b', label='validation')
    plt.xlabel('alpha')
    plt.ylabel('MSE')
    plt.legend()
    plt.grid(True)
    plt.title("Ridge classifier error as a function of alpha values")
    plt.show()

  return best_score_on_train, best_score_on_validation, best_alpha

# Tune the basic linear model on the full training split.
best_score_on_train, best_score_on_validation, best_alpha = tuning(train, features_list=features)
# Authors' note: "got (0,0) means there is no regularization at all".
# TODO: make sure this is the right plot + scores

**Q9**

In [None]:
# Store the tuned Ridge scores with one .loc[row, columns] assignment; the chained
# form errors_data.loc[row][col] = value may write into a temporary and be lost.
# The MSE columns are cast to float first because they were initialised with integer zeros.
mse_columns = ['Train MSE', 'Validation MSE']
errors_data = errors_data.astype({name: float for name in mse_columns})
errors_data.loc['Basic Linear', mse_columns] = [best_score_on_train, best_score_on_validation]

errors_data

**Q10**

In [None]:
# Refit Ridge with the selected alpha on the whole training split and inspect its weights.
print(best_alpha)
ridge = Ridge(best_alpha).fit(train[features], train['VariantScore'])
print('w = ',ridge.coef_)

# Rank features by absolute coefficient and plot the ten largest.
tmp = pd.Series(ridge.coef_, index=features).abs().sort_values()
tmp.nlargest(10).plot.barh()


# Section 4: Hierarchical linear regression

**Q11**

In [None]:
# Bivariate KDE of VariantScore against AgeGroup, with one set of contours per Sex value.
sns.kdeplot(data = train, x='AgeGroup', y='VariantScore', hue='Sex', color='b')

Now we will split the training set into females and males.

In [None]:
# Sex was encoded by category_to_numeric: 1 for 'M', 0 for everything else.
females = train[train['Sex'] == 0]
males = train[train['Sex'] == 1]

**Q12**

In [None]:
# Tune a separate Ridge model for each sex (females first, then males) and collect
# the results in one table. The table is built directly from the returned tuples:
# the former chained writes (SexDataFrame.loc[row][col] = value) go through a
# temporary Series and may leave the table filled with zeros.
SexDataFrame = pd.DataFrame(
  [tuning(females, features), tuning(males, features)],
  index=['Females', 'Males'],
  columns=['Train MSE', 'Validation MSE', 'Best alpha'],
)

SexDataFrame

In [None]:
# Per-sex tuning on the linear features; the selected alphas are used to build `multi` below.
# NOTE(review): this repeats the two tuning runs of the previous cell (and redraws their plots).
best_score_on_train_f, best_score_on_validation_f, best_alpha_f = tuning(train[train['Sex'] == 0], features)
best_score_on_train_m, best_score_on_validation_m, best_alpha_m = tuning(train[train['Sex'] == 1], features)

In [None]:
from sklearn.base import BaseEstimator, RegressorMixin
class MultiRegressor(BaseEstimator, RegressorMixin):
  """Regressor that delegates to one of two sub-models according to the 'Sex' column.

  Rows with Sex == 1 train `h_male` and rows with Sex == 0 train `h_female`.
  At prediction time rows with Sex == 0 go to `h_female` and all other rows
  to `h_male`. The 'Sex' column itself is never passed to the sub-models.
  """

  def __init__(self, h_male, h_female):
    self.h_male = h_male
    self.h_female = h_female

  def fit(self, X, y):
    """Fit both sub-models.

    Args:
      X: pandas DataFrame with a 'Sex' column.
      y: targets, positionally aligned with the rows of X (Series or array).

    Returns:
      self
    """
    # Positional masks keep X and y aligned even when y is a plain array or
    # the index contains duplicate labels.
    targets = np.asarray(y)
    male_rows = (X['Sex'] == 1).to_numpy()
    female_rows = (X['Sex'] == 0).to_numpy()
    predictors = X.drop(columns=['Sex'])

    self.h_male.fit(predictors[male_rows], targets[male_rows])
    self.h_female.fit(predictors[female_rows], targets[female_rows])
    return self

  def predict(self, X):
    """Predict one value per row of X (a pandas DataFrame with a 'Sex' column).

    Each sub-model predicts its whole group in a single call instead of one
    call per row; the sub-models therefore receive frames with the same column
    names they were fitted on.

    Returns:
      list of predictions in the row order of X.
    """
    female_rows = (X['Sex'] == 0).to_numpy()
    predictors = X.drop(columns=['Sex'])

    all_predictions = np.empty(len(X), dtype=float)
    for rows, model in ((female_rows, self.h_female), (~female_rows, self.h_male)):
      if rows.any():
        all_predictions[rows] = model.predict(predictors[rows])

    return list(all_predictions)

In [None]:
# Multilevel linear model: one Ridge per sex, each with the alpha tuned for that sex (male model first).
multi = MultiRegressor(Ridge(best_alpha_m), Ridge(best_alpha_f))

**Q13**

In [None]:
errors = CV_evaluation(multi, train[features], train['VariantScore'])

# Store both scores with one .loc[row, columns] assignment; the chained form
# errors_data.loc[row][col] = value may write into a temporary and be lost.
# The MSE columns are cast to float first because they were initialised with integer zeros.
mse_columns = ['Train MSE', 'Validation MSE']
errors_data = errors_data.astype({name: float for name in mse_columns})
errors_data.loc['Multilevel linear', mse_columns] = [errors[0], errors[1]]

errors_data

**Q14**

In [None]:
# Fit the multilevel model on the whole training split (the output of fit is the cell's display).
multi.fit(train[features], train['VariantScore'])

# Section 5: Polynomial fitting

Creating the new data frames, which add a squared copy of each feature.

In [None]:
def df_to_quad_df(df):
  """Return a copy of `df` extended with squared versions of its features.

  For every column except 'Sex', 'ID', 'BloodType' and 'VariantScore' a new
  column '<name>_quadratic' holding the squared values is appended after the
  original columns. The input frame is not modified.

  Only the columns that are kept get squared. The identifier/label columns
  used to be squared and then dropped, which wasted work and failed on a
  non-numeric ID.

  Args:
    df: DataFrame whose non-excluded columns are numeric.

  Returns:
    New DataFrame: the original columns followed by the '_quadratic' columns.
  """
  not_squared = {'Sex', 'ID', 'BloodType', 'VariantScore'}
  to_square = [col for col in df.columns if col not in not_squared]
  squared = np.power(df[to_square], 2).add_suffix('_quadratic')
  return pd.concat([df, squared], axis=1)

# Quadratic versions of both splits.
train_quadratic = df_to_quad_df(train)
test_quadratic = df_to_quad_df(test)

# Feature list for the quadratic models: every column except the target and the identifier.
quad_features = train_quadratic.columns.to_list()
quad_features.remove('VariantScore')
quad_features.remove('ID')

**Q17**

In [None]:
# this is what they mean ?
SexDataFrame2 = pd.DataFrame(columns= ['Train MSE', 'Validation MSE', 'Best alpha'])
SexDataFrame2.loc['Females'] = [0,0,0]
SexDataFrame2.loc['Males'] = [0,0,0]

best_score_on_train_f, best_score_on_validation_f, best_alpha_f = tuning(train_quadratic[train_quadratic['Sex'] == 0], features_list=quad_features)
best_score_on_train_m, best_score_on_validation_m, best_alpha_m = tuning(train_quadratic[train_quadratic['Sex'] == 1], features_list=quad_features)

SexDataFrame2.loc['Females']['Train MSE'], SexDataFrame2.loc['Females']['Validation MSE'], SexDataFrame2.loc['Females']['Best alpha'] = best_score_on_train_f, best_score_on_validation_f, best_alpha_f
SexDataFrame2.loc['Males']['Train MSE'], SexDataFrame2.loc['Males']['Validation MSE'], SexDataFrame2.loc['Males']['Best alpha'] = best_score_on_train_m, best_score_on_validation_m, best_alpha_m

SexDataFrame2

In [None]:
# Multilevel polynomial model: one Ridge per sex, using the alphas tuned on the quadratic features.
multi_polynomial = MultiRegressor(Ridge(best_alpha_m), Ridge(best_alpha_f))
multi_polynomial

**Q18**

In [None]:
# Score table indexed by (model, section, sex); every row starts at [0, 0]
# and is filled in by the next cell.
multiScoreDF = pd.DataFrame(columns= ['Multilevel Model','Section', 'Sex', 'Train MSE', 'Valid MSE'])
multiScoreDF = multiScoreDF.set_index(["Multilevel Model", "Section", "Sex"])
for gender in ['M', 'F']:
  multiScoreDF.loc['Linear', 4, gender] = [0,0]
  multiScoreDF.loc['Polynomial', 5, gender] = [0,0]

In [None]:
# Linear (section 4) rows: use the alphas that were tuned on the linear features.
# best_alpha_m / best_alpha_f were overwritten by the quadratic tuning, so the
# linear values are read back from `multi`, which was constructed from them.
Ridge_males = Ridge(multi.h_male.alpha)
Ridge_females = Ridge(multi.h_female.alpha)

err_train_m ,err_val_m = CV_evaluation(Ridge_males, males[features], males['VariantScore'])
err_train_f ,err_val_f = CV_evaluation(Ridge_females, females[features], females['VariantScore'])

multiScoreDF.loc['Linear', 4, 'M'] = [err_train_m ,err_val_m]
multiScoreDF.loc['Linear', 4, 'F'] = [err_train_f ,err_val_f]

# Polynomial (section 5) rows: per-sex quadratic frames with the alphas tuned on the quadratic features.
quad_m = train_quadratic[train_quadratic['Sex'] == 1]
quad_f = train_quadratic[train_quadratic['Sex'] == 0]

Ridge_males_quad = Ridge(best_alpha_m)
Ridge_females_quad = Ridge(best_alpha_f)

err_train_m_quad ,err_val_m_quad = CV_evaluation(Ridge_males_quad, quad_m[quad_features], quad_m['VariantScore'])
err_train_f_quad ,err_val_f_quad = CV_evaluation(Ridge_females_quad, quad_f[quad_features], quad_f['VariantScore'])

multiScoreDF.loc['Polynomial', 5, 'M'] = [err_train_m_quad ,err_val_m_quad]
multiScoreDF.loc['Polynomial', 5, 'F'] = [err_train_f_quad ,err_val_f_quad]

multiScoreDF

**Q20**

In [None]:
errors = CV_evaluation(multi_polynomial, train_quadratic[quad_features], train_quadratic['VariantScore'])

# Store both scores with one .loc[row, columns] assignment; the chained form
# errors_data.loc[row][col] = value may write into a temporary and be lost.
# The MSE columns are cast to float first because they were initialised with integer zeros.
mse_columns = ['Train MSE', 'Validation MSE']
errors_data = errors_data.astype({name: float for name in mse_columns})
errors_data.loc['Multilevel poly', mse_columns] = [errors[0], errors[1]]

errors_data

# Section 6: Testing our models

In [None]:
# Refit every model on the full training split ...
dummy_regressor.fit(train[features], train['VariantScore'])

ridge = Ridge(best_alpha)
ridge.fit(train[features], train['VariantScore'])

multi.fit(train[features], train['VariantScore'])

multi_polynomial.fit(train_quadratic[quad_features], train_quadratic['VariantScore'])

# ... and score each one on the held-out test split, in the row order of errors_data.
tmp_arr = [
  mean_squared_error(test['VariantScore'], dummy_regressor.predict(test[features])),
  mean_squared_error(test['VariantScore'], ridge.predict(test[features])),
  mean_squared_error(test['VariantScore'], multi.predict(test[features])),
  mean_squared_error(test_quadratic['VariantScore'], multi_polynomial.predict(test_quadratic[quad_features])),
]

# Copy of the CV summary with the test scores appended as a new column.
errors_data2 = errors_data.assign(**{'Test MSE': tmp_arr})
errors_data2

# Section 7: Custom models challenge

In [None]:
def get_top_feat(df, classifier_ctr, alpha, features_list):
  """Rank features by the absolute value of their fitted coefficient.

  Fits `classifier_ctr(alpha)` on df[features_list] against df['VariantScore'].

  Returns:
    (magnitudes, names): a Series of absolute coefficients sorted in ascending
    order (most influential feature last) and the feature names in that order.
  """
  model = classifier_ctr(alpha)
  model.fit(df[features_list], df['VariantScore'])

  magnitudes = pd.Series(model.coef_, index=features_list).abs().sort_values()
  return magnitudes, magnitudes.index.tolist()


def tuning4 (df, classifier_ctr, features_list):
  """Pick the alpha with the lowest cross-validated MSE on a fine linear grid.

  Tries alpha = 0.000, 0.025, ..., 9.975 with `classifier_ctr(alpha=...)` on
  df[features_list] against df['VariantScore'], and prints the winner.

  Returns:
    (best_score, best_alpha): lowest mean validation MSE and the alpha achieving it.
    best_alpha stays at the sentinel -1000 only if no score could be compared.
  """
  # Start from +inf: the former ceiling of 100 rejected every alpha whenever
  # all validation errors were >= 100 and returned the invalid alpha -1000.
  best_score = float('inf')
  best_alpha = -1000
  for a in range(0,10000,25):
    alpha = a/1000
    clf = classifier_ctr(alpha=alpha)
    _, valid_mse = CV_evaluation(clf, df[features_list], df['VariantScore'])
    if valid_mse < best_score:
      best_score = valid_mse
      best_alpha = alpha
  print("best_score is: " + str(best_score))
  print("best_alpha is: " + str(best_alpha))
  return (best_score, best_alpha)


def tuning_num_of_feat (df, classifier_ctr, alpha, features_list):
  """Choose how many of the trailing entries of `features_list` to keep.

  For n = 5, 7, 9, ... (strictly below len(features_list)) the model
  `classifier_ctr(alpha)` is cross-validated on the LAST n features of the list,
  so the list is expected to be ordered from least to most important.

  Returns:
    (best_score, best_n): lowest mean validation MSE and the n achieving it.
    best_n is 0 when no candidate was evaluated (fewer than 6 features).
  """
  # Start from +inf: the former ceiling of 100 rejected every candidate whenever
  # all validation errors were >= 100.
  best_score = float('inf')
  best_n = 0
  for n in range(5,len(features_list), 2):
    select_feat = features_list[-1*n:]
    clf = classifier_ctr(alpha)
    _, valid_mse = CV_evaluation(clf, df[select_feat], df['VariantScore'])
    if valid_mse < best_score:
      best_score = valid_mse
      best_n = n
  print("best_score is: " + str(best_score))
  print("best_n is: " + str(best_n))
  return (best_score, best_n)

# Feature lists for models trained on a single Sex/BloodType group, where those
# columns are constant: first without 'Sex', then also without 'BloodType'.
quad_features_no_sex = quad_features.copy()
quad_features_no_sex.remove('Sex')
quad_features_no_sex_no_blood = quad_features_no_sex.copy()
quad_features_no_sex_no_blood.remove('BloodType')

In [None]:
def build_clf(df, classifier_ctr, features_list):
  """Alternately tune alpha and the feature subset, then fit the final model.

  Three rounds of: (1) tune alpha on the currently selected features,
  (2) re-rank all features by absolute coefficient, (3) choose how many of the
  top-ranked features to keep.

  Returns:
    (model, selected): the model fitted on df[selected] against
    df['VariantScore'], and the list of selected feature names.
  """
  alpha = 0
  ranked = features_list
  # Start with a count larger than the number of features so the first slice
  # keeps all of them (assumes fewer than 666 features).
  n_keep = 666

  for _ in range(3):
    _, alpha = tuning4(df, classifier_ctr, ranked[-n_keep:])
    _, ranked = get_top_feat(df, classifier_ctr, alpha, ranked)
    _, n_keep = tuning_num_of_feat(df, classifier_ctr, alpha, ranked)

  selected = ranked[-n_keep:]
  model = classifier_ctr(alpha)
  model.fit(df[selected], df['VariantScore'])
  return model, selected

In [None]:
# Split the quadratic training data into four groups: sex (m/f) x BloodType code (b0/b1).
df_m_b0 = quad_m[quad_m['BloodType'] == 0]
df_m_b1 = quad_m[quad_m['BloodType'] == 1]
df_f_b0 = quad_f[quad_f['BloodType'] == 0]
df_f_b1 = quad_f[quad_f['BloodType'] == 1]

# One tuned Ridge model (and its selected feature list) per group; used by custom_pred.
clf_m_b0, clf_m_b0_feat = build_clf(df_m_b0, Ridge, quad_features_no_sex_no_blood)
clf_m_b1, clf_m_b1_feat = build_clf(df_m_b1, Ridge, quad_features_no_sex_no_blood)
clf_f_b0, clf_f_b0_feat = build_clf(df_f_b0, Ridge, quad_features_no_sex_no_blood)
clf_f_b1, clf_f_b1_feat = build_clf(df_f_b1, Ridge, quad_features_no_sex_no_blood)

In [None]:
def custom_pred(X):
  """Predict VariantScore with the four per-group models (sex x blood-type group).

  Rows with Sex == 1 use the male models and all other rows the female models;
  within each sex, rows with BloodType == 0 use the *_b0 model and all other
  rows the *_b1 model. Each model predicts its whole group in one call instead
  of one call per row, and receives a frame with the column names it was fitted on.

  Args:
    X: DataFrame with 'Sex', 'BloodType' and the features selected for each
      model. The models are read from the globals clf_*/clf_*_feat at call time.

  Returns:
    list of predictions in the row order of X.
  """
  is_male = (X['Sex'] == 1).to_numpy()
  is_b0 = (X['BloodType'] == 0).to_numpy()
  groups = (
    (is_male & is_b0, clf_m_b0, clf_m_b0_feat),
    (is_male & ~is_b0, clf_m_b1, clf_m_b1_feat),
    (~is_male & is_b0, clf_f_b0, clf_f_b0_feat),
    (~is_male & ~is_b0, clf_f_b1, clf_f_b1_feat),
  )

  all_pred = np.empty(len(X), dtype=float)
  for rows, clf, feat in groups:
    if rows.any():
      all_pred[rows] = clf.predict(X.loc[rows, feat])
  return list(all_pred)

# Test-set MSE of the custom model.
final_res = mean_squared_error(test_quadratic['VariantScore'], custom_pred(test_quadratic))
print(final_res)

In [None]:
# Same evaluation as the previous cell, but the quadratic frame is rebuilt from `test`
# instead of reusing `test_quadratic`.
final_res = mean_squared_error(test_quadratic['VariantScore'], custom_pred(df_to_quad_df(test)))
print(final_res)  # used as validation

# Section 8: Submitted model predictions

In [None]:
# Load the unlabeled dataset from its GitHub URL (requires network access).
unlabeled_raw = pd.read_csv('https://raw.githubusercontent.com/sivanyo/ML-HW3/main/variant_unlabeled.csv')

In [None]:
# Preprocess a copy so the raw frame stays intact; the pipeline reuses the statistics stored in `ppt`.
unlabeled = unlabeled_raw.copy()
preprocessing_data(unlabeled)

In [None]:
# Full labeled dataset (train + test rows), preprocessed with the same pipeline,
# plus its quadratic extension; used to refit the models before predicting.
all_df = df_raw.copy()
preprocessing_data(all_df) 
all_df_quad = df_to_quad_df(all_df)

In [None]:
# Rebuild the four per-group models on ALL labeled data. This overwrites the
# clf_* / clf_*_feat globals read by custom_pred.
quad_all_m = all_df_quad[all_df_quad['Sex'] == 1]
quad_all_f = all_df_quad[all_df_quad['Sex'] == 0]

df_m_b0 = quad_all_m[quad_all_m['BloodType'] == 0]
df_m_b1 = quad_all_m[quad_all_m['BloodType'] == 1]
df_f_b0 = quad_all_f[quad_all_f['BloodType'] == 0]
df_f_b1 = quad_all_f[quad_all_f['BloodType'] == 1]

clf_m_b0, clf_m_b0_feat = build_clf(df_m_b0, Ridge, quad_features_no_sex_no_blood)
clf_m_b1, clf_m_b1_feat = build_clf(df_m_b1, Ridge, quad_features_no_sex_no_blood)
clf_f_b0, clf_f_b0_feat = build_clf(df_f_b0, Ridge, quad_features_no_sex_no_blood)
clf_f_b1, clf_f_b1_feat = build_clf(df_f_b1, Ridge, quad_features_no_sex_no_blood)

In [None]:
# Predictions for the unlabeled data, one CSV per model (3, 4, 5 and 7).
# Every model is re-tuned and refitted on all labeled data (`all_df`).
# files.download() is the Colab helper that sends the file to the browser.
pred_unlabeled = pd.DataFrame(index=np.arange(unlabeled.shape[0]), columns = ['ID', 'VariantScore'])
pred_unlabeled['ID'] = unlabeled['ID']
unlabeled_quad = df_to_quad_df(unlabeled)


# Section 3: basic Ridge.
ridge = Ridge(tuning(all_df, features, print_flag=False)[2])
ridge.fit(all_df[features], all_df['VariantScore'])
pred_unlabeled['VariantScore'] =  ridge.predict(unlabeled[features])
pred_unlabeled.to_csv("pred_3.csv", index=False)
files.download("pred_3.csv")


# Section 4: multilevel linear model. The per-sex alphas are re-tuned on the linear
# features of all labeled data, like the other models in this cell. Previously
# best_alpha_m / best_alpha_f were used, which at this point hold the values tuned
# for the quadratic features.
alpha_m_linear = tuning(all_df[all_df['Sex'] == 1], features, print_flag=False)[2]
alpha_f_linear = tuning(all_df[all_df['Sex'] == 0], features, print_flag=False)[2]
multi = MultiRegressor(Ridge(alpha_m_linear), Ridge(alpha_f_linear))
multi.fit(all_df[features], all_df['VariantScore'])
pred_unlabeled['VariantScore'] =  multi.predict(unlabeled[features])
pred_unlabeled.to_csv("pred_4.csv", index=False)
files.download("pred_4.csv")


# Section 5: multilevel polynomial model, per-sex alphas tuned on the quadratic features.
alpha_m_quad = tuning(all_df_quad[all_df_quad['Sex'] == 1], features_list=quad_features, print_flag=False)[2]
alpha_f_quad = tuning(all_df_quad[all_df_quad['Sex'] == 0], features_list=quad_features, print_flag=False)[2]
multi_quad = MultiRegressor(Ridge(alpha_m_quad), Ridge(alpha_f_quad))
multi_quad.fit(all_df_quad[quad_features], all_df_quad['VariantScore'])
pred_unlabeled['VariantScore'] =  multi_quad.predict(unlabeled_quad[quad_features])
pred_unlabeled.to_csv("pred_5.csv", index=False)
files.download("pred_5.csv")


# Section 7: custom per-group models (rebuilt on all labeled data in the previous cell).
pred_unlabeled['VariantScore'] = custom_pred(unlabeled_quad)
pred_unlabeled.to_csv("pred_7.csv", index=False)
files.download("pred_7.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Draft (טיוטה)

In [None]:
# # this is to make sure we predicing unlabeled as we want to (check by trying the same proccess on test)
# unlabeled = test.copy()
# unlabeled.drop('VariantScore', axis=1, inplace=True)
# pred_unlabeled = pd.DataFrame(index=np.arange(unlabeled.shape[0]), columns = ['ID', 'VariantScore'])
# pred_unlabeled['ID'] = unlabeled['ID']
# unlabeled_quad = df_to_quad_df(unlabeled)

# ridge = Ridge(tuning(all_df, features, print_flag=False)[2])
# ridge.fit(all_df[features], all_df['VariantScore'])
# pred_unlabeled['VariantScore'] =  ridge.predict(unlabeled[features])
# pred_unlabeled.to_csv("pred_3.csv", index=False)
# # files.download("pred_3.csv")
# print("score3: ", mean_squared_error(pred_unlabeled['VariantScore'], test['VariantScore']))

# multi = MultiRegressor(Ridge(best_alpha_m), Ridge(best_alpha_f))
# multi.fit(all_df[features], all_df['VariantScore'])
# pred_unlabeled['VariantScore'] =  multi.predict(unlabeled[features])
# pred_unlabeled.to_csv("pred_4.csv", index=False)
# # files.download("pred_4.csv")
# print("score4: ", mean_squared_error(pred_unlabeled['VariantScore'], test['VariantScore']))

# multi_quad = MultiRegressor(Ridge(tuning(all_df_quad[all_df_quad['Sex'] == 1], features_list=quad_features, print_flag=False)[2]), Ridge(tuning(all_df_quad[all_df_quad['Sex'] == 0], print_flag=False, features_list=quad_features)[2]))
# multi_quad.fit(all_df_quad[quad_features], all_df_quad['VariantScore'])
# pred_unlabeled['VariantScore'] =  multi_quad.predict(unlabeled_quad[quad_features])
# pred_unlabeled.to_csv("pred_5.csv", index=False)
# # files.download("pred_5.csv")
# print("score5: ", mean_squared_error(pred_unlabeled['VariantScore'], test_quadratic['VariantScore']))

# pred_unlabeled['VariantScore'] = custom_pred(unlabeled_quad)
# pred_unlabeled.to_csv("pred_7.csv", index=False)
# # files.download("pred_7.csv")
# print("score7: ", mean_squared_error(pred_unlabeled['VariantScore'], test_quadratic['VariantScore']))


score3:  0.007203004031778517
score4:  0.0030599945277189444
score5:  0.002316613598073224
score7:  0.002146812888792962
