## Import Packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

## Data Preprocessing

### Buzasi Data

#### Import Tar file and Extract Postage Stamps 

In [None]:
import tarfile

# Unpack the postage-stamp archive into ./my_folder (relative to the current
# working directory). The context manager closes the archive even if the
# extraction raises part-way through.
# NOTE(review): extractall() trusts the member paths stored in the archive;
# only run this on an archive from a trusted source.
with tarfile.open('C:/Users/ptgri/tess_postage_stamps.tar') as my_tar:
    my_tar.extractall('./my_folder')

#### Write Postage Stamps to .txt file 

In [None]:
import os
os.chdir('C:/Users/ptgri/my_folder')

# Concatenate every .txt postage stamp in the folder into Buzasi.txt,
# writing a newline after each file's contents.
with open("Buzasi.txt", "w") as outfile:
    def write_text_file(file_path):
        """Append the contents of `file_path` to Buzasi.txt, followed by a newline."""
        with open(file_path) as f:
            contents = f.read()
            outfile.write(contents)
            outfile.write('\n')
    for file in os.listdir():
        # Buzasi.txt lives in this same folder and also ends with ".txt";
        # skip it so the output is never read back into itself.
        if file.endswith(".txt") and file != "Buzasi.txt":
            # os.path.join replaces the hand-built f-string path, which
            # contained the invalid escape sequence "\{".
            file_path = os.path.join('C:/Users/ptgri/my_folder', file)
            write_text_file(file_path)

#### Import and process .txt file as .csv file

In [None]:
# Column labels '1' .. '25' that receive the comma-separated values of each line.
pixel_columns = [str(n) for n in range(1, 26)]

# Blank lines are kept (skip_blank_lines=False); they come through as missing values.
Buzasi = pd.read_csv(
    'C:/Users/ptgri/my_folder/Buzasi.txt',
    header=None,
    sep=' ',
    quotechar='"',
    skip_blank_lines=False,
)
Buzasi = Buzasi.iloc[:49876]

# Split column 0 on commas into float columns, then drop the raw text column.
split_values = Buzasi[0].astype(str).str.split(',', expand=True).astype(float)
Buzasi[pixel_columns] = split_values
# Missing entries (short rows and blank lines) become 0.
Buzasi = Buzasi.drop(columns=[0]).fillna(0)

In [None]:
# Rebinds `Buzasi` to the contents of test.csv, replacing whatever `Buzasi` held before this cell.
# NOTE(review): presumably test.csv is a saved copy of the processed image table — confirm its provenance.
Buzasi = pd.read_csv('C:/Users/ptgri/test.csv')

#### Zero-pad images to equal dimensions, flatten each image into a single row, and build a NumPy array

In [None]:
N = 3912  # not used below; kept for the TODO that follows
M = 3887  # number of images extracted by the loop
# TODO: change M to N once Buzasi sends the images needed to be redone

# For every row: how many rows remain in its group, counting the row itself.
# A new group starts at each row whose column '1' equals 0 (presumably the
# all-zero separator rows between images — confirm against the input file).
# Computed once up front: it does not change between loop iterations.
count = Buzasi.groupby((Buzasi['1'] == 0).cumsum()).cumcount(ascending=False) + 1

padded_matrix = []
i = 0  # first row of the current image
for k in range(M):
    j = i + count[i]  # one past the last row of the current image
    image = Buzasi.iloc[i:j]
    shape = np.shape(image)
    # Place the image in the top-left corner of a 27 x 25 block of zeros;
    # an image larger than 27 x 25 raises here.
    padded_array = np.zeros((27, 25))
    padded_array[:shape[0], :shape[1]] = image
    padded_matrix.append(padded_array)
    i = j + 1  # skip the row that follows the image

# One flattened image (27 * 25 values) per row.
padded_matrix = np.array(padded_matrix)
padded_matrix = padded_matrix.reshape(M, 27*25)
# NOTE(review): 2542 must equal the number of labels in `y`, which is built in a later cell.
X = padded_matrix[:2542]

### Oswalt Data

In [None]:
# Read the spreadsheet, skipping its first two rows, and keep only the
# columns from position 49 onward.
oswalt_path = 'C:/Users/ptgri/Downloads/TESS targets Vizier physical data.xlsx'
df = pd.read_excel(oswalt_path, skiprows=2).iloc[:, 49:]

In [None]:
# TIC identifiers of the rows to exclude from the label table.
bad_image = [110376, 11490327, 27014182, 54926434, 87480403, 94367286, 105438311, 125802413, 267624955, 
             298017097, 302158903, 332680754, 354825513, 373139866, 382422098, 382422180, 399665349, 
             410205809, 416233532, 427733653, 441709021, 441804568, 443616529, 447823435, 458343311]
# `~mask` replaces `mask == False`. The explicit .copy() makes `df` an
# independent frame, so the column assignments in later cells do not trigger
# pandas' SettingWithCopyWarning.
df = df[~df['TIC'].isin(bad_image)].copy()

In [None]:
# ANR: 1 where SNR >= 4.0, 0 where SNR < 4.0. Rows satisfying neither
# comparison (e.g. a missing SNR) keep the empty-string placeholder.
snr_high = df['SNR'] >= 4.0
snr_low = df['SNR'] < 4.0
df['ANR'] = ""
df.loc[snr_high, 'ANR'] = 1
df.loc[snr_low, 'ANR'] = 0

In [None]:
# Peaks: 1 by default, 0 where 'Mult [M#]' contains 'M' (missing values never match).
has_m_flag = df['Mult [M#]'].str.contains('M', na=False)
df['Peaks'] = 1
df.loc[has_m_flag, 'Peaks'] = 0

In [None]:
# Rotators: 1 by default, 0 where 'Rot [R]' contains 'R' (missing values never match).
has_r_flag = df['Rot [R]'].str.contains('R', na=False)
df['Rotators'] = 1
df.loc[has_r_flag, 'Rotators'] = 0

In [None]:
# Pulsators: 1 by default, 0 where 'Pulse [P]' contains 'P' (missing values never match).
has_p_flag = df['Pulse [P]'].str.contains('P', na=False)
df['Pulsators'] = 1
df.loc[has_p_flag, 'Pulsators'] = 0

In [None]:
# Flares: 1 by default, 0 where 'Flare [F]' contains 'F' (missing values never match).
has_f_flag = df['Flare [F]'].str.contains('F', na=False)
df['Flares'] = 1
df.loc[has_f_flag, 'Flares'] = 0

In [None]:
# Eclipses: 1 by default, 0 where 'Ecl [E]' contains 'E' (missing values never match).
has_e_flag = df['Ecl [E]'].str.contains('E', na=False)
df['Eclipses'] = 1
df.loc[has_e_flag, 'Eclipses'] = 0

In [None]:
# Discs: 1 by default, 0 where 'Disc [D]' contains 'D' (missing values never match).
has_d_flag = df['Disc [D]'].str.contains('D', na=False)
df['Discs'] = 1
df.loc[has_d_flag, 'Discs'] = 0

In [None]:
# Blends: 1 by default, 0 where 'Blend [B]' is exactly 'B', '?' or 'B?'.
blend_flagged = df['Blend [B]'].isin(['B', '?', 'B?'])
df['Blends'] = 1
df.loc[blend_flagged, 'Blends'] = 0

In [None]:
# Trails: 1 by default, 0 where 'Trail [T]' is exactly 'T', '?' or 'T?'.
trail_flagged = df['Trail [T]'].isin(['T', '?', 'T?'])
df['Trails'] = 1
df.loc[trail_flagged, 'Trails'] = 0

In [None]:
# Sky Background: 1 by default, 0 where 'Sky [S]' is exactly 'S', '?' or 'S?'.
sky_flagged = df['Sky [S]'].isin(['S', '?', 'S?'])
df['Sky Background'] = 1
df.loc[sky_flagged, 'Sky Background'] = 0

In [None]:
# Aperture: 1 by default, 0 where 'Apert [A]' is exactly 'A', '?' or 'A?'.
aperture_flagged = df['Apert [A]'].isin(['A', '?', 'A?'])
df['Aperture'] = 1
df.loc[aperture_flagged, 'Aperture'] = 0

In [None]:
# Quality: 0 where 'Redo (X)' is 'X', 1 where it is 'OK' or 'OK?'.
# Any other value leaves the empty-string placeholder in place.
needs_redo = df['Redo (X)'] == 'X'
is_ok = df['Redo (X)'].isin(['OK', 'OK?'])
df['Quality'] = ""
df.loc[needs_redo, 'Quality'] = 0
df.loc[is_ok, 'Quality'] = 1

In [None]:
# Columns kept in the final label table.
label_columns = [
    'TIC', 'ANR', 'Peaks', 'Rotators', 'Pulsators', 'Flares', 'Eclipses',
    'Discs', 'Blends', 'Trails', 'Sky Background', 'Aperture', 'Quality',
]

# Drop rows with no 'Redo (X)' entry, then keep only the label columns.
df = df.dropna(axis=0, subset=['Redo (X)'])[label_columns]

# Target vector: the Quality column as int64.
# NOTE(review): rows of `y` are paired with rows of `X` purely by position — confirm both are in the same order.
y = df['Quality'].to_numpy().astype('int64')

### Splitting Data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 15% of the samples as the test set, stratified on the labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    stratify=y,
    random_state=1,
)

## Decision Tree Classifier

In [None]:
# Baseline tree with default settings apart from a fixed seed.
clf = DecisionTreeClassifier(random_state=1)
clf.fit(X_train, y_train)

### Performance on Training Data

In [None]:
# Accuracy of the fitted tree on the data it was trained on.
ypred_train = clf.predict(X_train)
accuracy_score(y_true=y_train, y_pred=ypred_train)

### Performance on Testing Data

In [None]:
# Accuracy of the fitted tree on the held-out test set.
ypred_test = clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=ypred_test)

### Information on the produced Decision Tree 

In [None]:
# Report the size of the fitted tree.
tree_depth = clf.get_depth()
leaf_count = clf.get_n_leaves()
print('Tree depth:', tree_depth)
print('Number of leaves:', leaf_count)

## Improving the Model

In [None]:
# Carve a 15% validation set out of the training data, stratified on the labels.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.15,
    stratify=y_train,
    random_state=1,
)

In [None]:
# Show the shapes of the training and validation feature arrays.
print(np.shape(X_tr), np.shape(X_val))

### Use Validation Error to select best model

In [None]:
size = len(X_tr)
choices = np.arange(2, 100)  # candidate values for max_leaf_nodes: 2 .. 99

# Validation error (1 - accuracy) for each candidate, in the same order as `choices`.
per = []
for max_leaves in choices:
    clf = tree.DecisionTreeClassifier(max_leaf_nodes=max_leaves, random_state=1)
    clf.fit(X_tr, y_tr)
    val_accuracy = accuracy_score(y_val, clf.predict(X_val))
    per.append(1 - val_accuracy)

In [None]:
# Validation error against the max_leaf_nodes setting.
fig, ax = plt.subplots()
ax.plot(choices, per)
plt.show()

In [None]:
# The candidate with the lowest validation error (the first one on ties).
best_index = np.argmin(np.array(per))
opt_nodes = choices[best_index]

# Same curve as before, with a vertical line at the selected value.
fig, ax = plt.subplots()
ax.plot(choices, per)
ax.axvline(x=opt_nodes, color='k')
ax.set_title('The best number of max leaves: '+str(opt_nodes), size=15)
ax.set_xlabel('Maximum number of leaves allowed', size=12)
ax.set_ylabel('Err(val)', size=12)
plt.show()

## Final Model Performance

In [None]:
# Refit on the full training set (training + validation rows) with the selected leaf limit.
clf = DecisionTreeClassifier(max_leaf_nodes=opt_nodes, random_state=1)
clf.fit(X_train, y_train)

In [None]:
# Test-set accuracy of the final, leaf-limited tree.
ypred_test = clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=ypred_test)