In [1]:
# Fetch the dataset archive with the Kaggle CLI; the zip is saved next to the notebook.
# NOTE(review): presumably needs the kaggle CLI installed and configured with API credentials — verify.
!kaggle datasets download muhammadshahidazeem/panic-disorder-detection-dataset

Downloading panic-disorder-detection-dataset.zip to /home/poli/Documents/data_science/ml/09
100%|██████████████████████████████████████| 1.50M/1.50M [00:01<00:00, 1.03MB/s]
100%|███████████████████████████████████████| 1.50M/1.50M [00:01<00:00, 997kB/s]


In [2]:
!unzip panic-disorder-detection-dataset.zip

Archive:  panic-disorder-detection-dataset.zip
  inflating: panic_disorder_dataset_testing.csv  
  inflating: panic_disorder_dataset_training.csv  


In [3]:
import numpy as np
import pandas as pd
import dill

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [4]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that extracts one column from the input by label.

    The lookup uses a bare label (``x[column]``), so a DataFrame input yields a
    one-dimensional Series rather than a single-column frame.
    """
    def __init__(self, column):
        # Label of the column to extract in transform().
        self.column = column

    def fit(self, x, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, x, y=None):
        """Return the configured column of ``x``; ``y`` is accepted but ignored."""
        picked = x[self.column]
        return picked

    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that keeps a single column of the data frame so that
    further transformations can be applied to it.

    The column is requested through a one-element list, so the result stays
    two-dimensional (a one-column frame). Intended for numeric columns.
    """
    def __init__(self, key):
        # Label of the column to keep in transform().
        self.key = key

    def fit(self, x, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, x):
        """Return ``x`` restricted to the configured column."""
        wanted = [self.key]
        return x[wanted]
    

class OHEEncoder(BaseEstimator, TransformerMixin):
    """
    One-hot encoder built on ``pd.get_dummies`` with a fixed output layout.

    ``fit`` records the dummy column names produced for the data it sees.
    ``transform`` then emits exactly those columns, in that order: recorded
    categories that are absent from the new data become all-zero columns, and
    categories that were not recorded by ``fit`` are left out of the result.
    """
    def __init__(self, key):
        # Prefix used for the generated dummy column names.
        self.key = key
        # Dummy column names captured by fit(); empty until fit() is called.
        self.columns = []

    def fit(self, x, y=None):
        """Remember the dummy column names generated from ``x`` and return self."""
        dummies = pd.get_dummies(x, prefix=self.key, dtype=int)
        self.columns = list(dummies.columns)
        return self

    def transform(self, x):
        """Encode ``x`` and return a NumPy array laid out like the fitted columns."""
        encoded = pd.get_dummies(x, prefix=self.key, dtype=int)
        # Snapshot of the columns this batch actually produced.
        present = set(encoded.columns)
        for name in self.columns:
            if name in present:
                continue
            # Category seen during fit but missing here: add it as all zeros.
            encoded[name] = 0
        # Select in fitted order and hand back an array, not a data frame.
        return encoded[self.columns].to_numpy()

In [5]:
# Load the data with 'Participant ID' as the row index and preview the first rows.
# NOTE(review): this reads the *testing* CSV although the archive also contains a
# training CSV; the train/test split further down is carved out of this file — confirm
# that this is intended.
df = pd.read_csv('panic_disorder_dataset_testing.csv', index_col='Participant ID')
df.head()

Unnamed: 0_level_0,Age,Gender,Family History,Personal History,Current Stressors,Symptoms,Severity,Impact on Life,Demographics,Medical History,Psychiatric History,Substance Use,Coping Mechanisms,Social Support,Lifestyle Factors,Panic Disorder Diagnosis
Participant ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,41,Male,Yes,No,High,Shortness of breath,Mild,Mild,Urban,Diabetes,Bipolar disorder,Alcohol,Seeking therapy,Low,Exercise,0
2,20,Female,Yes,No,Low,Shortness of breath,Mild,Significant,Urban,Asthma,Anxiety disorder,Drugs,Exercise,High,Diet,0
3,32,Male,Yes,Yes,High,Panic attacks,Severe,Mild,Rural,Heart disease,Bipolar disorder,Drugs,Meditation,Moderate,Exercise,0
4,41,Female,Yes,Yes,Moderate,Shortness of breath,Moderate,Significant,Urban,Heart disease,Anxiety disorder,,Exercise,High,Sleep quality,0
5,36,Female,Yes,No,High,Chest pain,Severe,Significant,Rural,Asthma,Depressive disorder,,Seeking therapy,Low,Exercise,0


In [6]:
df.info()
# We deliberately do not fill in the missing values: in all three columns a gap means "none".
# The dummy-based one-hot encoder does not treat NaN as a category, so it adds no NaN feature
# and simply leaves zeros in every existing category column, which is exactly what we need.

<class 'pandas.core.frame.DataFrame'>
Index: 20000 entries, 1 to 20000
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       20000 non-null  int64 
 1   Gender                    20000 non-null  object
 2   Family History            20000 non-null  object
 3   Personal History          20000 non-null  object
 4   Current Stressors         20000 non-null  object
 5   Symptoms                  20000 non-null  object
 6   Severity                  20000 non-null  object
 7   Impact on Life            20000 non-null  object
 8   Demographics              20000 non-null  object
 9   Medical History           14999 non-null  object
 10  Psychiatric History       15011 non-null  object
 11  Substance Use             13383 non-null  object
 12  Coping Mechanisms         20000 non-null  object
 13  Social Support            20000 non-null  object
 14  Lifestyle Factors         2

In [7]:
# Count the rows per class of the target label to check class balance.
df['Panic Disorder Diagnosis'].value_counts()

Panic Disorder Diagnosis
0    19159
1      841
Name: count, dtype: int64

In [8]:
# Separate the label from the predictors, then make a seeded hold-out split that is
# stratified on the label (split size is left at train_test_split's default).
target_name = 'Panic Disorder Diagnosis'
labels = df[target_name]
predictors = df.drop([target_name], axis=1)

x_train, x_test, y_train, y_test = train_test_split(
    predictors,
    labels,
    random_state=42,
    stratify=labels,
)

# Persist every part to its own CSV; index=None keeps the row index out of the files.
# Test parts are written first, then train parts.
for csv_name, split_part in (
    ('x_test.csv', x_test),
    ('y_test.csv', y_test),
    ('x_train.csv', x_train),
    ('y_train.csv', y_train),
):
    split_part.to_csv(csv_name, index=None)

In [9]:
# Print the distinct values of every categorical column, one line per column.
# Each value goes through str() so that columns containing NaN can be joined as well;
# for the purely textual columns this leaves the values unchanged.
for column_name in (
    'Gender',
    'Family History',
    'Personal History',
    'Current Stressors',
    'Symptoms',
    'Severity',
    'Impact on Life',
    'Demographics',
    'Medical History',
    'Psychiatric History',
    'Substance Use',
    'Coping Mechanisms',
    'Social Support',
    'Lifestyle Factors',
):
    distinct_values = df[column_name].unique()
    print(f'{column_name} values: {", ".join(map(str, distinct_values))}')

Gender values: Male, Female
Family History values: Yes, No
Personal History values: No, Yes
Current Stressors values: High, Low, Moderate
Symptoms values: Shortness of breath, Panic attacks, Chest pain, Dizziness, Fear of losing control
Severity values: Mild, Severe, Moderate
Impact on Life values: Mild, Significant, Moderate
Demographics values: Urban, Rural
Medical History values: Diabetes, Asthma, Heart disease, nan
Psychiatric History values: Bipolar disorder, Anxiety disorder, Depressive disorder, nan
Substance Use values: Alcohol, Drugs, nan
Coping Mechanisms values: Seeking therapy, Exercise, Meditation, Socializing
Social Support values: Low, High, Moderate
Lifestyle Factors values: Exercise, Diet, Sleep quality


In [10]:
# Column groups used to build the preprocessing branches.
# Numeric columns (selected and standard-scaled).
num_features = ['Age']

# Categorical columns (selected and one-hot encoded).
cat_features = [
    'Gender',
    'Family History',
    'Personal History',
    'Current Stressors',
    'Symptoms',
    'Severity',
    'Impact on Life',
    'Demographics',
    'Medical History',
    'Psychiatric History',
    'Substance Use',
    'Coping Mechanisms',
    'Social Support',
    'Lifestyle Factors',
]

In [11]:
# One (name, pipeline) pair per column: categorical columns first, numeric ones after.
# Categorical branch: pick the column, then one-hot encode it.
final_transformers = [
    (
        name,
        Pipeline([('selector', FeatureSelector(column=name)),
                  ('ohe', OHEEncoder(key=name))]),
    )
    for name in cat_features
]

# Numeric branch: pick the column as a one-column frame, then standard-scale it.
final_transformers += [
    (
        name,
        Pipeline([('selector', NumberSelector(key=name)),
                  ('Scale', StandardScaler())]),
    )
    for name in num_features
]

In [12]:
# Display the assembled (name, pipeline) pairs for inspection.
final_transformers

[('Gender',
  Pipeline(steps=[('selector', FeatureSelector(column='Gender')),
                  ('ohe', OHEEncoder(key='Gender'))])),
 ('Family History',
  Pipeline(steps=[('selector', FeatureSelector(column='Family History')),
                  ('ohe', OHEEncoder(key='Family History'))])),
 ('Personal History',
  Pipeline(steps=[('selector', FeatureSelector(column='Personal History')),
                  ('ohe', OHEEncoder(key='Personal History'))])),
 ('Current Stressors',
  Pipeline(steps=[('selector', FeatureSelector(column='Current Stressors')),
                  ('ohe', OHEEncoder(key='Current Stressors'))])),
 ('Symptoms',
  Pipeline(steps=[('selector', FeatureSelector(column='Symptoms')),
                  ('ohe', OHEEncoder(key='Symptoms'))])),
 ('Severity',
  Pipeline(steps=[('selector', FeatureSelector(column='Severity')),
                  ('ohe', OHEEncoder(key='Severity'))])),
 ('Impact on Life',
  Pipeline(steps=[('selector', FeatureSelector(column='Impact on Life')),
   

In [13]:
# Combine all per-column branches; FeatureUnion concatenates their outputs side by side.
feats = FeatureUnion(final_transformers)

In [14]:
# Full model: the feature union followed by a support-vector classifier.
# probability=True enables probability estimates; scikit-learn produces them with an
# internal, shuffled cross-validation, so random_state is pinned (same seed as the
# train/test split) to make refits of this notebook reproducible.
pipeline = Pipeline([('features', feats),
                     ('classifier', SVC(probability=True, random_state=42))])

In [15]:
# Fit the preprocessing branches and the classifier on the training split.
pipeline.fit(x_train, y_train)

In [16]:
# Serialize the fitted pipeline to 'svc_clf.dill' (binary mode) with dill.
# NOTE(review): presumably dill is used instead of pickle so that the transformer classes
# defined in this notebook are stored together with the model — verify when loading.
with open('svc_clf.dill', 'wb') as file:
    dill.dump(pipeline, file)