In [3]:
# Import Packages
import pandas as pd
import numpy as np
from matplotlib import pyplot
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn import svm
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from collections import Counter

In [14]:
# Directory holding the input tables read here and the prepared splits written below.
DATA_DIR = "../../data/datasets"

batch_data = pd.read_csv(f"{DATA_DIR}/batch_meta_combine.csv")

# Rename 'herd' so that its name contains an "x" and it is picked up by the
# regex filter below together with the feature columns.
batch_data = batch_data.rename(columns={'herd': 'herdx'})

# Split into X and y.
# DataFrame.filter(regex=...) matches with re.search, so this keeps every
# column whose name contains an "x" anywhere, not only names starting with it.
X = batch_data.filter(regex=("x.*"))
y = batch_data['group_class']

# Set the herd labels aside (they share X's index) and drop them from the
# feature matrix so they are not passed on as a model input.
herds = X['herdx']
X = X.drop('herdx', axis=1)

# Subset X to the columns listed in the 'feature' column of rf_features.csv.
rf_features = pd.read_csv(f"{DATA_DIR}/rf_features.csv")
X = X[rf_features['feature']]

# Hold out 20% of the rows as the test set.
# NOTE(review): neither split is stratified on y; if the classes are
# imbalanced, consider passing stratify=... (not changed here because it would
# alter the generated datasets).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Split the remaining rows 75/25 into train and validation
# (i.e. roughly 60/20/20 train/val/test overall).
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=123456)

# Herd label of every row in each split. Looking the labels up by index keeps
# them in exactly the row order of the corresponding feature matrix, which
# matters because they are written without an index next to the scaled arrays.
X_train_herds = herds.loc[X_train.index]
X_val_herds = herds.loc[X_val.index]
X_test_herds = herds.loc[X_test.index]

# Standardize X: fit the scaler on the training rows only, then apply the same
# transformation to the validation and test rows.
scaler = preprocessing.StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)


def save_split(split_name, features_scaled, split_herds, split_labels):
    """Write one prepared split to DATA_DIR as three CSV files.

    Parameters
    ----------
    split_name : str
        Tag used in the file names ("train", "val" or "test").
    features_scaled : numpy.ndarray
        Scaled feature matrix; written without a header row.
    split_herds : pandas.Series
        Herd label per row; written with a header but without the index.
    split_labels : pandas.Series
        Target class per row; written with a header but without the index.
    """
    np.savetxt(f"{DATA_DIR}/X_{split_name}_scaled.csv", features_scaled, delimiter=",")
    split_herds.to_csv(f"{DATA_DIR}/X_{split_name}_herds.csv", index=False)
    split_labels.to_csv(f"{DATA_DIR}/y_{split_name}.csv", index=False)


save_split("train", X_train_scaled, X_train_herds, y_train)
save_split("val", X_val_scaled, X_val_herds, y_val)
save_split("test", X_test_scaled, X_test_herds, y_test)


In [15]:
# Upsample Training Data
#
# SMOTE generates synthetic samples for the under-represented classes using a
# random number generator. Fixing random_state makes the oversampled training
# set, and the CSV files written from it below, identical on every
# Restart & Run All instead of changing from run to run.
# Only the training split is oversampled.
oversample = SMOTE(random_state=42)
X_train_scaled_oversample, y_train_oversample = oversample.fit_resample(X_train_scaled, y_train)

# Persist the oversampled features (no header) and labels (header, no index).
np.savetxt("../../data/datasets/X_train_scaled_oversample.csv", X_train_scaled_oversample, delimiter=",")
y_train_oversample.to_csv("../../data/datasets/y_train_oversample.csv",index=False)


