In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load alz.csv (path relative to the notebook's working directory) into the
# DataFrame used by the cells below, then preview its first rows.
df = pd.read_csv('alz.csv')
df.head()

Unnamed: 0,Country,Age,Gender,Education Level,BMI,Physical Activity Level,Smoking Status,Alcohol Consumption,Diabetes,Hypertension,...,Dietary Habits,Air Pollution Exposure,Employment Status,Marital Status,Genetic Risk Factor (APOE-ε4 allele),Social Engagement Level,Income Level,Stress Levels,Urban vs Rural Living,Alzheimer’s Diagnosis
0,Spain,90,Male,1,33.0,Medium,Never,Occasionally,No,No,...,Healthy,High,Retired,Single,No,Low,Medium,High,Urban,No
1,Argentina,72,Male,7,29.9,Medium,Former,Never,No,No,...,Healthy,Medium,Unemployed,Widowed,No,High,Low,High,Urban,No
2,South Africa,86,Female,19,22.9,High,Current,Occasionally,No,Yes,...,Average,Medium,Employed,Single,No,Low,Medium,High,Rural,No
3,China,53,Male,17,31.2,Low,Never,Regularly,Yes,No,...,Healthy,Medium,Retired,Single,No,High,Medium,Low,Rural,No
4,Sweden,58,Female,3,30.0,High,Former,Never,Yes,No,...,Unhealthy,High,Employed,Married,No,Low,Medium,High,Rural,No


In [2]:
def grab_cols(df, cat_th=25, car_th=25):
    """Split the columns of ``df`` into categorical, numeric and high-cardinality groups.

    A column is first classified by dtype (numeric vs. everything else) and then
    re-classified by its number of distinct values:

    * numeric columns with fewer than ``cat_th`` distinct values are treated as
      categorical ("numeric but categorical");
    * non-numeric columns with more than ``car_th`` distinct values are treated as
      high-cardinality ("categorical but cardinal") and removed from the
      categorical list.

    The four resulting lists are also printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are classified.
    cat_th : int, default 25
        A numeric column with strictly fewer distinct values than this is
        considered categorical.
    car_th : int, default 25
        A non-numeric column with strictly more distinct values than this is
        considered high-cardinality.

    Returns
    -------
    tuple of list
        ``(cat_cols, num_cols, cat_but_car, num_but_cat)`` -- note the order.
    """
    num_cols = list(df.select_dtypes(include="number"))
    cat_cols = [col for col in df.columns if col not in num_cols]

    num_but_cat = [col for col in num_cols if df[col].nunique() < cat_th]
    cat_but_car = [col for col in cat_cols if df[col].nunique() > car_th]

    # Low-cardinality numerics join the categoricals (appended after the
    # dtype-based ones); high-cardinality columns are dropped from them.
    cat_cols = [col for col in cat_cols + num_but_cat if col not in cat_but_car]
    num_cols = [col for col in num_cols if col not in num_but_cat]

    print(f"cat_cols = {cat_cols}")
    print(f"num_cols = {num_cols}")
    print(f"num_but_cat = {num_but_cat}")
    print(f"cat_but_car= {cat_but_car}")
    return cat_cols, num_cols, cat_but_car, num_but_cat
# Classify the columns of df; the unpacking order matches grab_cols' return order
# (cat_cols, num_cols, cat_but_car, num_but_cat).
cat_cols,num_cols,cat_but_car,num_but_cat = grab_cols(df)

cat_cols = ['Country', 'Gender', 'Physical Activity Level', 'Smoking Status', 'Alcohol Consumption', 'Diabetes', 'Hypertension', 'Cholesterol Level', 'Family History of Alzheimer’s', 'Depression Level', 'Sleep Quality', 'Dietary Habits', 'Air Pollution Exposure', 'Employment Status', 'Marital Status', 'Genetic Risk Factor (APOE-ε4 allele)', 'Social Engagement Level', 'Income Level', 'Stress Levels', 'Urban vs Rural Living', 'Alzheimer’s Diagnosis', 'Education Level']
num_cols = ['Age', 'BMI', 'Cognitive Test Score']
num_but_cat = ['Education Level']
cat_but_car= []


In [3]:
# Percentage of rows diagnosed 'Yes' per country, highest first.
# This compares against the raw 'Yes' label, so it has to run before the cell
# that maps the diagnosis column to 1/0.
diagnosed_mask = df['Alzheimer’s Diagnosis'] == 'Yes'
yes_counts = df.loc[diagnosed_mask, 'Country'].value_counts()

total_counts = df['Country'].value_counts()

# Division aligns on the country index; a country without any 'Yes' row yields NaN.
percentages = yes_counts.div(total_counts).mul(100).sort_values(ascending=False)
percentages

Country
Russia          50.449974
India           50.334135
South Africa    49.521277
Brazil          48.580359
Mexico          48.471373
South Korea     41.586281
Saudi Arabia    41.425451
Germany         41.082217
UK              40.372501
Australia       40.269342
Argentina       40.203699
France          40.053908
Spain           39.994592
Italy           39.339420
USA             38.855088
China           38.641425
Norway          34.970318
Sweden          34.209813
Canada          34.141741
Japan           33.937617
Name: count, dtype: float64

In [5]:
def outliers(df, variable, low_quantile=0.2, up_quantile=0.8):
    """Return the (lower, upper) outlier fences for column ``variable`` of ``df``.

    The fences are ``q_low - 1.5 * (q_up - q_low)`` and
    ``q_up + 1.5 * (q_up - q_low)``, where ``q_low`` / ``q_up`` are the
    ``low_quantile`` / ``up_quantile`` quantiles of the column. The defaults
    (0.2 / 0.8) are wider than the classic quartile-based IQR rule; pass
    0.25 / 0.75 to get the textbook fences.

    Parameters
    ----------
    df : pandas.DataFrame
    variable : str
        Name of a numeric column of ``df``.
    low_quantile, up_quantile : float, default 0.2 and 0.8
        Quantiles in [0, 1] with ``low_quantile < up_quantile``.

    Returns
    -------
    tuple
        ``(lower_lim, upper_lim)``.

    Raises
    ------
    ValueError
        If the quantiles are not ordered or fall outside [0, 1].
    """
    if not 0 <= low_quantile < up_quantile <= 1:
        raise ValueError(
            "expected 0 <= low_quantile < up_quantile <= 1, got "
            f"{low_quantile!r} and {up_quantile!r}"
        )
    lower_q = df[variable].quantile(low_quantile)
    upper_q = df[variable].quantile(up_quantile)
    spread = upper_q - lower_q
    lower_lim = lower_q - 1.5 * spread
    upper_lim = upper_q + 1.5 * spread
    return lower_lim, upper_lim


def check_outliers(df, variable, low_quantile=0.2, up_quantile=0.8):
    """Return True if any value of ``df[variable]`` lies outside the outlier fences.

    The fences come from :func:`outliers` with the same quantile arguments.
    NaN values compare as False on both sides and are therefore never
    counted as outliers.
    """
    lower_lim, upper_lim = outliers(df, variable, low_quantile, up_quantile)
    # Test the row mask itself. Reducing the *selected rows* with .any() would
    # look at the truthiness of their values and miss an outlier row that only
    # holds zeros / empty values.
    is_outlier = (df[variable] < lower_lim) | (df[variable] > upper_lim)
    return bool(is_outlier.any())
    
# Report, for every numeric column found by grab_cols, whether it has values
# outside the fences computed by outliers().
for col in num_cols:
    print(col,check_outliers(df,col))

Age False
BMI False
Cognitive Test Score False


In [7]:
# Display the target column before it is encoded (values are the strings shown below).
df['Alzheimer’s Diagnosis']

0        No
1        No
2        No
3        No
4        No
         ..
74278    No
74279    No
74280    No
74281    No
74282    No
Name: Alzheimer’s Diagnosis, Length: 74283, dtype: object

In [8]:
# Encode the target in place: 'Yes' -> 1, 'No' -> 0.
# The dtype guard makes the cell safe to re-run: Series.map turns every value that
# is not a key of the mapping into NaN, so mapping an already-encoded (numeric)
# column a second time would wipe the target.
if not pd.api.types.is_numeric_dtype(df['Alzheimer’s Diagnosis']):
    df['Alzheimer’s Diagnosis'] = df['Alzheimer’s Diagnosis'].map({'Yes':1,'No':0})

In [None]:
def ohe(dataframe,cat_cols):
    """One-hot encode ``cat_cols`` of ``dataframe`` and return the new frame.

    The first level of every encoded column is dropped (``drop_first=True``)
    and the dummy columns are integers. The input frame is not modified.
    """
    return pd.get_dummies(
        dataframe,
        columns=cat_cols,
        drop_first=True,
        dtype=int,
    )

In [9]:
def rare_analysis(df,target,cat_cols):
    """Print a frequency / target-mean summary for every column in ``cat_cols``.

    For each column this prints its name with the number of distinct (non-null)
    levels, then a table with one row per level holding the level's share of all
    rows ("Ratio"), its row count ("Count") and the mean of ``target`` within
    that level ("Target_Mean"), ordered by descending ratio, followed by a
    separator line. Nothing is returned.
    """
    n_rows = len(df)
    for col in cat_cols:
        level_counts = df[col].value_counts()
        print(col, ":", len(level_counts))

        level_ratio = (level_counts / n_rows).sort_values(ascending=False)
        # Align count and target mean to the ratio ordering.
        ordered_levels = level_ratio.index
        summary = pd.DataFrame({
            "Ratio": level_ratio,
            "Count": level_counts.loc[ordered_levels],
            "Target_Mean": df.groupby(col)[target].mean().loc[ordered_levels],
        })
        print(summary)
        print("-------------------------------------------------------")
        
# Summarise every categorical column against the diagnosis target.
# Target_Mean needs a numeric target, so this relies on the diagnosis column
# already being mapped to 1/0.
rare_analysis(df,"Alzheimer’s Diagnosis",cat_cols)

Country : 20
                 Ratio  Count  Target_Mean
Country                                   
Brazil        0.051681   3839     0.485804
Germany       0.051250   3807     0.410822
Australia     0.050981   3787     0.402693
Russia        0.050860   3778     0.504500
South Africa  0.050617   3760     0.495213
Japan         0.050496   3751     0.339376
India         0.050361   3741     0.503341
South Korea   0.050240   3732     0.415863
Argentina     0.050227   3731     0.402037
Italy         0.050133   3724     0.393394
Canada        0.049958   3711     0.341417
France        0.049944   3710     0.400539
Norway        0.049890   3706     0.349703
Spain         0.049783   3698     0.399946
Sweden        0.049661   3689     0.342098
Saudi Arabia  0.049298   3662     0.414255
UK            0.049150   3651     0.403725
USA           0.048679   3616     0.388551
Mexico        0.048436   3598     0.484714
China         0.048356   3592     0.386414
-----------------------------------------

In [23]:
# Per-row mean Age of the row's (Depression Level, Hypertension) group;
# transform() returns a Series aligned with df's index rather than one value per group.
df.groupby(["Depression Level","Hypertension"])["Age"].transform("mean")

0        72.042994
1        72.042994
2        72.167054
3        71.870761
4        72.022621
           ...    
74278    71.870761
74279    72.042994
74280    72.042994
74281    72.022621
74282    72.042994
Name: Age, Length: 74283, dtype: float64

In [22]:
# Mean Age for every (Depression Level, Sleep Quality) combination, one value per group.
df.groupby(["Depression Level","Sleep Quality"])["Age"].mean()

Depression Level  Sleep Quality
High              Average          72.002057
                  Good             72.151880
                  Poor             72.041497
Low               Average          72.143205
                  Good             71.944179
                  Poor             72.019620
Medium            Average          71.579516
                  Good             71.974365
                  Poor             71.824844
Name: Age, dtype: float64

In [None]:
def high_correlated_cols(dataframe, plot=False, corr_th=0.9):
    """List columns whose absolute correlation with an earlier column exceeds ``corr_th``.

    Only numeric and boolean columns take part in the correlation; other
    columns are ignored (``DataFrame.corr`` raises on non-numeric data in
    pandas >= 2.0 unless they are excluded first).

    Parameters
    ----------
    dataframe : pandas.DataFrame
    plot : bool, default False
        When True, also draw a heatmap of the upper triangle of the absolute
        correlation matrix. This imports seaborn lazily and changes the
        seaborn/matplotlib global figure size via ``sns.set``.
    corr_th : float, default 0.9
        Threshold applied to the absolute correlation (strictly greater than).

    Returns
    -------
    list
        Column names, in frame order, that correlate above the threshold with
        at least one column positioned before them.
    """
    numeric_frame = dataframe.select_dtypes(include=["number", "bool"])
    cor_matrix = numeric_frame.corr().abs()

    # Keep only the strict upper triangle so every pair is looked at once and
    # the diagonal (self-correlation of 1.0) never triggers a drop.
    keep_upper = np.triu(np.ones(cor_matrix.shape), k=1).astype(bool)
    upper_triangle_matrix = cor_matrix.where(keep_upper)
    drop_list = [col for col in upper_triangle_matrix.columns
                 if (upper_triangle_matrix[col] > corr_th).any()]

    if plot:
        import seaborn as sns
        import matplotlib.pyplot as plt
        sns.set(rc={'figure.figsize': (15, 15)})

        # Visualise upper_triangle_matrix (the masked lower triangle is left blank).
        sns.heatmap(upper_triangle_matrix, cmap="RdBu", annot=True, cbar=False, linewidths=0.5, linecolor='gray')
        plt.show()
    return drop_list
# Correlate the missing-value indicators (1 = null) of each column and plot them.
# NOTE(review): `dff` is not defined in any cell shown in this notebook -- confirm
# which frame is meant before running.
high_correlated_cols(dff.isnull().astype(int),plot=True)


In [None]:
def label_encoder(df,binary_col):
    """Replace ``df[binary_col]`` in place with its integer label encoding.

    A fresh scikit-learn ``LabelEncoder`` is fitted on the column and the
    column is overwritten with the encoded values. Nothing is returned and the
    fitted encoder is not kept.
    """
    # Imported here because the notebook's import cell does not bring
    # LabelEncoder into scope.
    from sklearn.preprocessing import LabelEncoder

    le = LabelEncoder()
    df[binary_col] = le.fit_transform(df[binary_col])
    
# Non-numeric columns with exactly two distinct (non-null) values.
# select_dtypes is used instead of np.issubdtype(df[col], np.number): the latter
# relies on NumPy coercing a Series to a dtype and raises for pandas extension
# dtypes such as 'category'.
binary_cols = [col for col in df.select_dtypes(exclude="number").columns
               if df[col].nunique() == 2]

# label_encoder overwrites each column of df in place.
for col in binary_cols:
    label_encoder(df,col)

# Last expression of the cell, so the encoded column names are displayed.
binary_cols

In [None]:
# Mean and count of Age per "new_title" group.
# NOTE(review): "new_title" is not among the columns listed by grab_cols for the
# loaded dataset -- this looks like it targets a different frame; confirm.
df.groupby("new_title").agg({"Age":["mean","count"]})

In [None]:
# NOTE(review): the lower-case columns used here (sibsp, parch, sex, age) are not
# among the columns listed by grab_cols for the loaded dataset -- confirm the frame.

# is_alone: 0 when sibsp + parch is positive, 1 when it is exactly 0
# (rows where the sum is NaN match neither mask and are left unset).
df.loc[(df["sibsp"]+df["parch"])>0,"is_alone"] = 0
df.loc[(df["sibsp"]+df["parch"]==0),"is_alone"] = 1

# new_sex_cat: age bands for male rows (<=21, >21 and <=50, >50).
df.loc[(df["sex"]=="male") & (df["age"]<=21),"new_sex_cat"] ="youngmale"
df.loc[(df["sex"]=="male") & (df["age"]>21) & (df["age"]<=50),"new_sex_cat"] = "maturemale"
# The closing quote was missing on this literal, which made the whole cell a SyntaxError.
df.loc[(df["sex"]=="male") & (df["age"]>50),"new_sex_cat"] = "seniormale"

In [None]:
# One-hot encode `ohe_cols` of df3 with feature_engine, dropping the last level of each
# variable, and overwrite df3 with the encoded frame.
# NOTE(review): binding the encoder to `ohe` replaces the `ohe(dataframe, cat_cols)` helper
# function defined in an earlier cell; `ohe_cols` and `df3` are not defined in the cells
# shown here -- confirm both before running.
from feature_engine.encoding import OneHotEncoder
ohe = OneHotEncoder(variables=ohe_cols,drop_last=True)
df3=ohe.fit_transform(df3)

In [None]:
# Total Quantity per Description, largest first (kept as a one-column DataFrame).
# NOTE(review): "Description" / "Quantity" are not columns of the dataset loaded above -- confirm the frame.
df.groupby("Description")[["Quantity"]].sum().sort_values(by="Quantity",ascending =False)

In [None]:
# First rows of sibsp / pclass / sex for rows with age > 50 and sex == "female".
# NOTE(review): these lower-case columns are not in the dataset loaded above -- confirm the frame.
df.loc[((df["age"]>50) & (df["sex"]=="female")),["sibsp","pclass","sex"]].head()

In [None]:
# Bar chart of the number of rows per value of "sex".
# NOTE(review): the dataset loaded above has "Gender", not "sex" -- confirm the frame.
df["sex"].value_counts().plot(kind="bar",color="orange")

In [None]:
# 60-bin histogram of the total_bill column.
# NOTE(review): "total_bill" is not a column of the dataset loaded above -- confirm the frame.
df.total_bill.hist(color ="orange",bins=60)

In [None]:
# Names of all non-numeric columns of df.
list(df.select_dtypes(exclude="number"))

In [None]:
# Line plot of the mean of y_train for each value of X_train["Neighborhood"].
# NOTE(review): X_train / y_train are not created in the cells shown here -- confirm where they come from.
y_train.groupby(X_train["Neighborhood"]).mean().plot()

In [None]:
# One figure per X_train column: mean of y_train for each value of that column.
for var in X_train.columns:
    plt.figure()
    # Series.plot returns the Axes it drew on, so the handle is named accordingly.
    ax = y_train.groupby(X_train[var]).mean().plot(color="orange")
    ax.set(title=var, ylabel="mean_saleprice")
    plt.show()

In [None]:
# Same per-feature plot of the mean of y_train, with a descriptive title.
for var in X_train.columns:
    plt.figure()
    mean_by_level = y_train.groupby(X_train[var]).mean()
    axes = mean_by_level.plot(color="orange")
    axes.set_title(f"monotonic between {var} and Saleprice ")
    axes.set_ylabel("mean saleprice")
    plt.show()

In [None]:
# OrdinalEncoder was used without being imported anywhere in the notebook. The
# `encoding_method` / `variables` arguments are those of feature_engine's encoder,
# which is the library the other encoder cells import from.
from feature_engine.encoding import OrdinalEncoder

# "ordered" assigns integers to the categories following the mean of the target per
# category, which is why fit() needs y_train as well.
ordinal_enc = OrdinalEncoder(encoding_method="ordered",
                             variables=list(X_train.columns))

ordinal_enc.fit(X_train, y_train)

In [None]:
# Mean of y_train for each value of X_train["Cabin_1"], as a {value: mean} dict.
ordered_labels = y_train.groupby(X_train["Cabin_1"]).mean().to_dict()
ordered_labels

In [None]:
# Fit feature_engine's MeanEncoder on every column of X_train against y_train
# and display the learned per-variable mappings.
from feature_engine.encoding import MeanEncoder
mean_enc = MeanEncoder(variables =list(X_train.columns),smoothing="auto")
mean_enc.fit(X_train, y_train)
mean_enc.encoder_dict_

In [None]:
# One figure per X_test column: mean of y_test for each value of that column.
for col in X_test.columns:
    plt.figure()
    test_mean_by_level = y_test.groupby(X_test[col]).mean()
    test_mean_by_level.plot(color="orange")
    plt.show()

In [None]:
# Fit category_encoders' TargetEncoder (smoothing=10) on every column of X_train
# against y_train and display the fitted mapping.
from category_encoders.target_encoder import TargetEncoder

target_enc = TargetEncoder(cols=list(X_train.columns),
                           smoothing=10)

target_enc.fit(X_train,y_train)
target_enc.mapping

In [None]:
# For each high-cardinality column, bar-plot the share of X_train rows per level
# (most frequent first) with a red reference line at 5 %.
for col in cat_but_car:
    # Divide by the full row count so missing values still count in the denominator.
    level_share = (X_train[col].value_counts() / len(X_train)).sort_values(ascending=False)
    ax = level_share.plot.bar(color="orange")
    ax.set_xlabel(col)
    ax.axhline(y=0.05, color="red")
    ax.set_ylabel("percentage")
    plt.show()

In [None]:
# 50-bin histograms of dft's columns on a 12x6 figure.
# NOTE(review): `dft` is not defined in the cells shown here -- confirm which frame is meant.
dft.hist(bins=50,color='orange',figsize=(12,6))