In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import seaborn as sn
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, LinearSVR 
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, balanced_accuracy_score, precision_score, recall_score, classification_report
from sklearn.preprocessing import StandardScaler


In [2]:
# Load the pre-processed star catalogue into a DataFrame.
df = pd.read_csv("../Res/processed_raw_data.csv")


In [3]:
# Show the loaded frame as the cell output to eyeball the columns and a few rows.
df

Unnamed: 0,Vmag,Plx,e_Plx,B-V,SpType,Amag,target
0,9.27,21.90,3.10,0.999,K3V,20.972221,0.0
1,8.06,7.75,0.97,0.370,F0V,17.506509,0.0
2,8.55,2.87,1.11,0.902,G8III,15.839409,1.0
3,12.31,18.80,4.99,1.336,M0V:,23.680789,0.0
4,8.59,10.76,1.10,0.489,F6V,18.749061,0.0
...,...,...,...,...,...,...,...
44563,8.05,6.24,1.17,1.157,K2III,17.025923,1.0
44564,9.26,2.68,1.36,0.521,F5V,16.400674,0.0
44565,8.79,0.89,1.28,1.194,K1III,13.536950,1.0
44566,7.69,6.60,0.92,1.110,K2III,16.787720,1.0


In [4]:
# Print column dtypes and non-null counts for the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44568 entries, 0 to 44567
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Vmag    44568 non-null  float64
 1   Plx     44568 non-null  float64
 2   e_Plx   44568 non-null  float64
 3   B-V     44568 non-null  float64
 4   SpType  44568 non-null  object 
 5   Amag    44568 non-null  float64
 6   target  44568 non-null  float64
dtypes: float64(6), object(1)
memory usage: 2.4+ MB


In [5]:
# Work on an independent copy without the string-typed spectral-type column,
# so that only numeric columns remain for modelling.
dff = df.drop("SpType", axis="columns").copy()

In [7]:
# `dff` contains infinite values (and possibly some NaNs). Turn +/-inf into NaN
# so that every unusable entry is handled as an ordinary missing value.
# Rebinding instead of `inplace=True` keeps the step explicit and chainable.
dff = dff.replace([np.inf, -np.inf], np.nan)

# A row without a label cannot be used for supervised training, and filling the
# label with the column mean would create a fractional (non-binary) class value.
# Drop such rows before imputing; the index is rebuilt so it stays contiguous.
dff = dff.dropna(subset=["target"]).reset_index(drop=True)

# Impute the remaining gaps with the per-column mean. `target` has no missing
# values left at this point, so only feature columns are affected.
# NOTE(review): the means are computed on the full data set, i.e. before the
# train/test split further down — confirm this leakage is acceptable.
dff = dff.fillna(dff.mean())

In [8]:
# Label vector: the `target` column.
y = dff["target"]

# Feature matrix: every remaining column except the label and the B-V colour index.
X = dff.drop(["target", "B-V"], axis="columns").copy()

In [9]:
# Check the feature matrix: column names, dtypes and non-null counts.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44568 entries, 0 to 44567
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Vmag    44568 non-null  float64
 1   Plx     44568 non-null  float64
 2   e_Plx   44568 non-null  float64
 3   Amag    44568 non-null  float64
dtypes: float64(4)
memory usage: 1.4 MB


In [10]:
# Display the feature matrix as the cell output.
X

Unnamed: 0,Vmag,Plx,e_Plx,Amag
0,9.27,21.90,3.10,20.972221
1,8.06,7.75,0.97,17.506509
2,8.55,2.87,1.11,15.839409
3,12.31,18.80,4.99,23.680789
4,8.59,10.76,1.10,18.749061
...,...,...,...,...
44563,8.05,6.24,1.17,17.025923
44564,9.26,2.68,1.36,16.400674
44565,8.79,0.89,1.28,13.536950
44566,7.69,6.60,0.92,16.787720


In [11]:
# Check the label series: dtype and non-null count.
y.info()

<class 'pandas.core.series.Series'>
RangeIndex: 44568 entries, 0 to 44567
Series name: target
Non-Null Count  Dtype  
--------------  -----  
44568 non-null  float64
dtypes: float64(1)
memory usage: 348.3 KB


In [12]:
# Display the label series as the cell output.
y

0        0.0
1        0.0
2        1.0
3        0.0
4        0.0
        ... 
44563    1.0
44564    0.0
44565    1.0
44566    1.0
44567    0.0
Name: target, Length: 44568, dtype: float64

In [13]:
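# Disabled experiment: label-encode the column at position 4 of X.
# Left commented out on purpose -- X holds only numeric columns, so no
# categorical encoding is applied.
#labelencoder_X=LabelEncoder()
#X[:,4]=labelencoder_X.fit_transform(X[:,4])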

In [14]:
# Hold out part of the data for evaluation. No test size is given, so
# scikit-learn's default split fraction applies; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [15]:
# Support-vector classifier with an RBF kernel, trained on the training split.
# NOTE(review): the features are passed in unscaled although StandardScaler is
# imported at the top of the notebook -- confirm whether scaling was intended.
clfr = SVC(
    kernel="rbf",
    random_state=42,
)
clfr.fit(X=X_train, y=y_train)

In [16]:

# Predict labels for the held-out rows with the fitted SVC.
svc_pred = clfr.predict(X=X_test)

In [17]:
# Confusion matrix for the SVC predictions: rows are the true classes,
# columns are the predicted classes.
conf_mtx = confusion_matrix(y_true=y_test, y_pred=svc_pred)
conf_mtx

array([[4376, 2123],
       [ 726, 3917]], dtype=int64)

In [18]:
# Per-class precision / recall / F1 for the SVC, listing class 1 before class 0.
print(
    classification_report(
        y_true=y_test,
        y_pred=svc_pred,
        labels=[1, 0],
    )
)

              precision    recall  f1-score   support

           1       0.65      0.84      0.73      4643
           0       0.86      0.67      0.75      6499

    accuracy                           0.74     11142
   macro avg       0.75      0.76      0.74     11142
weighted avg       0.77      0.74      0.75     11142



In [19]:
# Random forest with 500 trees, trained on the same training split.
# Note that this rebinds `clfr`, which previously held the SVC.
clfr = RandomForestClassifier(
    n_estimators=500,
    random_state=42,
)
clfr.fit(X=X_train, y=y_train)


In [20]:
# Predict labels for the held-out rows with the fitted random forest.
rf_pred = clfr.predict(X=X_test)

In [21]:
# Confusion matrix for the random-forest predictions: rows are the true
# classes, columns are the predicted classes.
cof_mtx = confusion_matrix(y_true=y_test, y_pred=rf_pred)
cof_mtx

array([[4849, 1650],
       [1403, 3240]], dtype=int64)

In [22]:
# Per-class precision / recall / F1 for the random forest, class 1 listed first.
print(
    classification_report(
        y_true=y_test,
        y_pred=rf_pred,
        labels=[1, 0],
    )
)

              precision    recall  f1-score   support

           1       0.66      0.70      0.68      4643
           0       0.78      0.75      0.76      6499

    accuracy                           0.73     11142
   macro avg       0.72      0.72      0.72     11142
weighted avg       0.73      0.73      0.73     11142



In [23]:
# Logistic regression with default settings: fit on the training split,
# then predict labels for the held-out rows.
logreg = LogisticRegression().fit(X=X_train, y=y_train)
log_pred = logreg.predict(X=X_test)

In [24]:
# Confusion matrix for the logistic-regression predictions (this overwrites
# the random-forest matrix stored under the same name).
cof_mtx = confusion_matrix(y_true=y_test, y_pred=log_pred)
cof_mtx

array([[5032, 1467],
       [1586, 3057]], dtype=int64)

In [25]:
# Per-class precision / recall / F1 for logistic regression, class 1 listed first.
print(
    classification_report(
        y_true=y_test,
        y_pred=log_pred,
        labels=[1, 0],
    )
)

              precision    recall  f1-score   support

           1       0.68      0.66      0.67      4643
           0       0.76      0.77      0.77      6499

    accuracy                           0.73     11142
   macro avg       0.72      0.72      0.72     11142
weighted avg       0.73      0.73      0.73     11142

