In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'dowjones',
 'exercise',
 'flights',
 'fmri',
 'geyser',
 'glue',
 'healthexp',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'seaice',
 'taxis',
 'tips',
 'titanic']

In [8]:
df = sns.load_dataset('penguins')

In [21]:
from sklearn.cluster import DBSCAN, AgglomerativeClustering, KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [16]:
df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


In [57]:
categorical_columns = df.select_dtypes(exclude=np.number).columns
numerical_columns = df.select_dtypes(include=np.number).columns

categorical_transformer = Pipeline([('cat_impute', SimpleImputer(strategy='most_frequent')),
                                   ('oh', OneHotEncoder())])

numerical_transformer = Pipeline([('impute', SimpleImputer()),
                                   ('scale', StandardScaler())])

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_columns),
    ('cat', categorical_transformer, categorical_columns)]
                                )

In [59]:
si = SimpleImputer(strategy='most_frequent')
df[categorical_columns] = si.fit_transform(df[categorical_columns])

In [64]:
df[categorical_columns]

Unnamed: 0,species,island,sex
0,Adelie,Torgersen,Male
1,Adelie,Torgersen,Female
2,Adelie,Torgersen,Female
3,Adelie,Torgersen,Male
4,Adelie,Torgersen,Female
...,...,...,...
339,Gentoo,Biscoe,Male
340,Gentoo,Biscoe,Female
341,Gentoo,Biscoe,Male
342,Gentoo,Biscoe,Female


In [66]:
le = LabelEncoder()
df['species'] = le.fit_transform(df['species'])
df['island'] = le.fit_transform(df['island'])
df['sex'] = le.fit_transform(df['sex'])

In [68]:
preprocessor.fit_transform(df)

array([[-0.88708123,  0.78774251, -1.42248782, ...,  1.        ,
         0.        ,  1.        ],
       [-0.81349399,  0.12655633, -1.06535169, ...,  1.        ,
         1.        ,  0.        ],
       [-0.66631952,  0.43171918, -0.42250666, ...,  1.        ,
         1.        ,  0.        ],
       ...,
       [ 1.1917582 , -0.73807176,  1.50602843, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.23512413, -1.19581604,  0.79175618, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.09977416, -0.53462985,  0.8631834 , ...,  0.        ,
         0.        ,  1.        ]])

In [52]:
sample = preprocessor.fit_transform(df)

In [53]:
pd.DataFrame(sample)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-8.870812e-01,7.877425e-01,-1.422488,-0.565789,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,-8.134940e-01,1.265563e-01,-1.065352,-0.503168,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,-6.663195e-01,4.317192e-01,-0.422507,-1.192003,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,-1.307172e-15,1.806927e-15,0.000000,0.000000,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,-1.328605e+00,1.092905e+00,-0.565361,-0.941517,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
339,-1.307172e-15,1.806927e-15,0.000000,0.000000,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
340,5.294731e-01,-1.450118e+00,1.006038,0.811880,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
341,1.191758e+00,-7.380718e-01,1.506028,1.939064,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
342,2.351241e-01,-1.195816e+00,0.791756,1.250229,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0


In [54]:
ct = ColumnTransformer([
    ('oh', OneHotEncoder(), categorical_columns),
    ('impute_cat', SimpleImputer(strategy='most_frequent'), categorical_columns),
    ('impute_num', SimpleImputer(),numerical_columns),
                       ])

In [55]:
dfdf = ct.fit_transform(df)

In [56]:
dfdf = pd.DataFrame(dfdf)

In [33]:
dfdf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,Adelie,Torgersen,Male,39.1,18.7,181.0,3750.0
1,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,Adelie,Torgersen,Female,39.5,17.4,186.0,3800.0
2,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,Adelie,Torgersen,Female,40.3,18.0,195.0,3250.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,Adelie,Torgersen,Male,43.92193,17.15117,200.915205,4201.754386
4,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,Adelie,Torgersen,Female,36.7,19.3,193.0,3450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,Gentoo,Biscoe,Male,43.92193,17.15117,200.915205,4201.754386
340,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,Gentoo,Biscoe,Female,46.8,14.3,215.0,4850.0
341,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,Gentoo,Biscoe,Male,50.4,15.7,222.0,5750.0
342,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,Gentoo,Biscoe,Female,45.2,14.8,212.0,5200.0


In [13]:
X = df.drop('species', axis=1)
y = df['species']

X_train, X_text, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [14]:
X_train

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
98,Dream,33.1,16.1,178.0,2900.0,Female
114,Biscoe,39.6,20.7,191.0,3900.0,Female
118,Torgersen,35.7,17.0,189.0,3350.0,Female
303,Biscoe,50.0,15.9,224.0,5350.0,Male
343,Biscoe,49.9,16.1,213.0,5400.0,Male
...,...,...,...,...,...,...
10,Torgersen,37.8,17.1,186.0,3300.0,
245,Biscoe,46.1,15.1,215.0,5100.0,Male
75,Torgersen,42.8,18.5,195.0,4250.0,Male
82,Torgersen,36.7,18.8,187.0,3800.0,Female


In [78]:
s = 'python is simple'

In [79]:
dict_ = {}

for char in s:
    if char in dict_.keys():
        dict_[char] = dict_[char] + 1
    else:
        dict_[char] = 1