In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the challenge's train and test CSVs and stack them row-wise into one frame.
# NOTE(review): pd.concat keeps each file's own row labels (no ignore_index), and any
# column present in only one file is NaN for the other file's rows — the df.dropna()
# call further down would then discard those rows entirely. Confirm this is intended.
df1 = pd.read_csv('../input/of-genomes-and-genetics-hackerearth-ml-challenge/train.csv')
df2 = pd.read_csv('../input/of-genomes-and-genetics-hackerearth-ml-challenge/test.csv')
df = pd.concat([df1,df2],axis=0)

In [None]:
# Remove the identifier / name columns, which are treated as irrelevant for modelling.
df = df.drop(
    columns=[
        'Patient Id',
        'Patient First Name',
        'Family Name',
        "Father's name",
    ],
)

In [None]:
# Show the (rows, columns) dimensions of the frame.
df.shape

In [None]:
# Print a per-column summary (dtype and non-null count) of the frame.
df.info()

In [None]:
# Count the missing values in every column.
df.isnull().sum()

In [None]:
# Drop every row that contains at least one missing value.
# Rebinding the name (instead of inplace=True) avoids mutating a frame that other
# names may still reference and keeps the statement chainable.
df = df.dropna()

In [None]:
# Separate the categorical (object-dtype) columns from the numerical ones.
def sep(df):
    """Partition ``df`` column-wise by dtype.

    Returns a ``(cat_df, num_df)`` tuple: ``cat_df`` holds the columns whose dtype
    is ``object`` and ``num_df`` holds every other column. Within each frame the
    columns keep the order they have in ``df``.
    """
    object_cols, other_cols = [], []
    for name in df.columns:
        bucket = object_cols if df[name].dtypes == 'object' else other_cols
        bucket.append(name)
    return df[object_cols], df[other_cols]
# Split the frame by dtype and list which columns landed on each side,
# with an '@@@@' line separating the two listings.
cat_df, num_df = sep(df)
print(cat_df.columns, '@@@@', num_df.columns, sep='\n')

In [None]:
# Label-encode the categorical (object-dtype) columns of a frame.
def leb(df):
    """Return a copy of ``df`` with every object-dtype column label-encoded.

    Each object column is replaced by the integer codes produced by a
    ``sklearn.preprocessing.LabelEncoder`` fitted on that column alone; all other
    columns are carried over unchanged.

    The input frame is NOT modified. The previous version assigned the codes back
    into the caller's frame, which in this notebook is a column slice of a larger
    frame — the pattern pandas reports as SettingWithCopyWarning.
    """
    from sklearn.preprocessing import LabelEncoder

    encoded = df.copy()
    for name in encoded.columns:
        if encoded[name].dtypes == 'object':
            # fit_transform refits from scratch, so a fresh encoder per column is
            # equivalent to reusing one and makes the independence explicit.
            encoded[name] = LabelEncoder().fit_transform(encoded[name])
    return encoded
# Replace the categorical frame with its label-encoded version.
cat_df = leb(cat_df)

In [None]:
# Remove columns that hold fewer than two distinct values, reporting each one removed.
def check_value(df):
    """Drop the columns of ``df`` that have fewer than two unique values.

    For every such column a line ``<column> -> <unique count>`` is printed. The
    frame without those columns is returned; when nothing qualifies, ``df`` itself
    is returned unchanged.
    """
    constant_columns = []
    for name in df.columns:
        distinct = len(df[name].unique())
        if distinct >= 2:
            continue
        print(f'{name} -> {distinct}')
        constant_columns.append(name)
    if not constant_columns:
        return df
    return df.drop(constant_columns, axis=1)

# Strip single-valued columns from both halves: numerical first, then categorical.
num_df = check_value(num_df)
cat_df = check_value(cat_df)

In [None]:
# Plot the pairwise correlation matrix of the numerical and categorical frames
# (joined column-wise) as a heatmap with the coefficient written in every cell.
# figsize is given in inches.
plt.figure(figsize=(50,50))
sns.heatmap((pd.concat([num_df,cat_df],axis = 1)).corr(),linewidths=0.5,annot=True)

In [None]:
# Observation from the heatmap: "Location of Institute" is highly correlated with
# "Place of birth", so only one of the two needs to be kept. "Location of Institute"
# (a categorical column) has the larger number of unique values, so it is the one dropped.
cat_df = cat_df.drop(columns=["Location of Institute"])

In [None]:
# Feature engineering
# Feature selection on the numerical frame with a low-variance filter: columns whose
# variance after normalisation falls below a threshold are reported for removal
# (the premise being that near-constant features contribute little to the target).
def num_feature_eng(df, threshold=0.005):
    """Return the names of the low-variance columns of a numerical frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Numerical features only; passed to ``sklearn.preprocessing.normalize``.
    threshold : float, default 0.005
        Columns whose variance of the normalised values is strictly below this
        value are reported. The default is the cut-off that used to be hard-coded.

    Returns
    -------
    list
        Column labels of ``df`` to drop, in column order. Empty when ``df`` has no
        rows or no columns (there is nothing to measure).

    NOTE(review): ``normalize`` is called with its defaults, which per the
    scikit-learn documentation scale each *row* to unit L2 norm (``axis=1``), not
    each column. The variances compared here are therefore those of row-normalised
    values. Behaviour is left as-is; confirm whether per-feature scaling was meant.
    """
    from sklearn.preprocessing import normalize

    if df.empty:
        return []

    normalized = normalize(df)
    # Re-attach the column labels so the mask below selects by name rather than
    # relying on positional integer look-ups into the variance Series.
    variances = pd.DataFrame(normalized, columns=df.columns).var()
    # NaN variances (e.g. a single-row frame) compare False and are not reported.
    return variances.index[variances < threshold].tolist()
# Collect the low-variance numerical columns, then remove them from the frame.
lst = num_feature_eng(num_df)
num_df = num_df.drop(columns=lst)

In [None]:
# Display the numerical frame as it currently stands.
num_df

In [None]:
# Categorical feature selection with the help of the chi-square test.
# There are two target variables, so the relevant features have to be found
# separately for each of them.

# Candidate features: every categorical column except the two targets.
x1 = cat_df.drop(columns=['Genetic Disorder', 'Disorder Subclass'])
# The two targets.
y1, y2 = cat_df['Genetic Disorder'], cat_df['Disorder Subclass']

# Return the list of relevant features: the columns of x whose chi-square p-value
# against y is below a cut-off.
def cat_feat_sel(x, y, p_threshold=0.6):
    """Select the columns of ``x`` by chi-square p-value against ``y``.

    Parameters
    ----------
    x : pandas.DataFrame
        Candidate features, passed to ``sklearn.feature_selection.chi2``.
    y : array-like
        Target labels.
    p_threshold : float, default 0.6
        A column is kept when its p-value is strictly below this value.

    Returns
    -------
    list
        Labels of the selected columns, in the order they appear in ``x``.

    NOTE(review): the comment that accompanied this function quoted a cut-off of
    "00.5" (presumably 0.05), while the code has always compared against 0.6. The
    default keeps the coded behaviour so existing results do not change; pass
    ``p_threshold=0.05`` for the conventional significance level once confirmed.
    """
    from sklearn.feature_selection import chi2

    # chi2 returns (statistics, p-values); only the p-values drive the selection.
    _, p_values = chi2(x, y)
    # A NaN p-value compares False and is therefore never selected.
    return [
        column
        for column, p_value in zip(x.columns.tolist(), p_values)
        if p_value < p_threshold
    ]

# Feature set for 'Genetic Disorder': the numerical frame plus the categorical
# columns selected against y1.
l1 = cat_feat_sel(x1, y1)
X1 = pd.concat((num_df, cat_df.loc[:, l1]), axis='columns')

# Feature set for 'Disorder Subclass': same construction against y2.
l2 = cat_feat_sel(x1, y2)
X2 = pd.concat((num_df, cat_df.loc[:, l2]), axis='columns')

In [None]:
# Two feature sets are now available: X1 for "Genetic Disorder" and X2 for "Disorder Subclass".
# Preview the first rows of X1.
X1.head()

In [None]:
# Preview the first rows of X2.
X2.head()

In [None]:
# Split each feature set into train (70 %) and test (30 %) partitions.
# A fixed random_state makes the shuffling — and every score computed from these
# partitions — reproducible across kernel restarts.
from sklearn.model_selection import train_test_split
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.3, random_state=42)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.3, random_state=42)

In [None]:
# Fit a panel of classifiers and report the accuracy each one reaches on the test split,
# so the best-scoring model can be picked from the result.
def best_model(X_train, X_test, y_train, y_test):
    """Train several scikit-learn classifiers and score each on the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, handed to every classifier's ``fit``.
    X_test, y_test
        Held-out features and labels used to compute the accuracy.

    Returns
    -------
    list of [accuracy, name]
        One entry per classifier, in this fixed order: Logistic regression, SGD,
        Gaussian NB, KNN, decision tree, random forest, SVC. The name strings are
        kept exactly as they were (including their spelling) because callers may
        match on them.

    Fixes over the previous version
    -------------------------------
    * ``predict`` now receives ``X_test`` itself. It used to receive ``[X_test]``,
      a one-element list that turns the 2-D feature matrix into a 3-D input.
    * Every classifier is scored with its own predictions. KNN, decision tree,
      random forest and SVC were previously scored with the logistic-regression
      model's predictions (copy-paste of ``m1.predict``).
    """
    from sklearn.linear_model import LogisticRegression,SGDClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import SVC
    from sklearn.metrics import accuracy_score

    # (label, estimator) pairs; all estimators use their library defaults.
    candidates = [
        ('Logistic regression', LogisticRegression()),
        ('SGD', SGDClassifier()),
        ('Gaussion NB', GaussianNB()),
        ('KNN', KNeighborsClassifier()),
        ('Decession tree', DecisionTreeClassifier()),
        ('Random forest', RandomForestClassifier()),
        ('SVC', SVC()),
    ]

    scores = []
    for label, estimator in candidates:
        estimator.fit(X_train, y_train)
        predictions = estimator.predict(X_test)
        scores.append([accuracy_score(y_test, predictions), label])
    return scores
# Score every candidate classifier on the X1 / y1 ("Genetic Disorder") split.
best_model(X1_train, X1_test, y1_train, y1_test)

In [None]:
# Fit a standalone logistic regression on the X1 / y1 split and show its test accuracy.
from sklearn.linear_model import LogisticRegression
# accuracy_score was only ever imported inside best_model(), so it was undefined
# at notebook scope and this cell raised NameError.
from sklearn.metrics import accuracy_score
m1 = LogisticRegression()
m1.fit(X1_train,y1_train)
# predict() takes the 2-D feature matrix itself; wrapping it in a list adds a
# third dimension, which the estimator rejects.
y_p1 = m1.predict(X1_test)
a1 = accuracy_score(y1_test,y_p1)
a1

In [None]:
# Display the training labels of the X1 / y1 split.
y1_train