# Feature Extraction (Özellik Çıkarımı)

# Binary Features: Flag, Bool, True-False

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
# pip install missingno
import missingno as msno
from datetime import date
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler, LabelEncoder,StandardScaler , RobustScaler

In [2]:
def load():
    data = pd.read_csv("datasets/titanic.csv")
    return data

In [3]:
df = load()

In [4]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


  Özellik Çıkarımı ( Feature Extraction ) : Ham veriden değişken üretmek
  
  
1 - Yapısal veriden değişken türetmek
Mevcut değişken üzerinden veri türetmek
2 - Yapısal olmayan veriden değişken türetmek
Yazı, ses, görüntü gibi veri üzerinden veri türetmek
  
  Binary Features : flag, bool , true- false
  1- 0 şeklinde var olan değişken üzerinden yeni değişken üretmektir
  
  

In [9]:
df["NEW_CABIN_BOOL"] = df["Cabin"].notnull().astype('int')

df["NEW_CABIN_BOOL"]

0      0
1      1
2      0
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: NEW_CABIN_BOOL, Length: 891, dtype: int32

In [10]:
df.groupby("NEW_CABIN_BOOL").agg({"Survived": "mean"})

Unnamed: 0_level_0,Survived
NEW_CABIN_BOOL,Unnamed: 1_level_1
0,0.299854
1,0.666667


In [11]:

from statsmodels.stats.proportion import proportions_ztest

In [12]:
test_stat, pvalue = proportions_ztest(count=[df.loc[df["NEW_CABIN_BOOL"] == 1, "Survived"].sum(),
                                             df.loc[df["NEW_CABIN_BOOL"] == 0, "Survived"].sum()],

                                      nobs=[df.loc[df["NEW_CABIN_BOOL"] == 1, "Survived"].shape[0],
                                            df.loc[df["NEW_CABIN_BOOL"] == 0, "Survived"].shape[0]])

print('Test Stat = %.4f, p-value = %.4f' % (test_stat, pvalue))


Test Stat = 9.4597, p-value = 0.0000


In [13]:
df.loc[((df['SibSp'] + df['Parch']) > 0), "NEW_IS_ALONE"] = "NO"

In [14]:
df.loc[((df['SibSp'] + df['Parch']) == 0), "NEW_IS_ALONE"] = "YES"

In [15]:
df.groupby("NEW_IS_ALONE").agg({"Survived": "mean"})

Unnamed: 0_level_0,Survived
NEW_IS_ALONE,Unnamed: 1_level_1
NO,0.50565
YES,0.303538


In [16]:
test_stat, pvalue = proportions_ztest(count=[df.loc[df["NEW_IS_ALONE"] == "YES", "Survived"].sum(),
                                             df.loc[df["NEW_IS_ALONE"] == "NO", "Survived"].sum()],

                                      nobs=[df.loc[df["NEW_IS_ALONE"] == "YES", "Survived"].shape[0],
                                            df.loc[df["NEW_IS_ALONE"] == "NO", "Survived"].shape[0]])

print('Test Stat = %.4f, p-value = %.4f' % (test_stat, pvalue))

Test Stat = -6.0704, p-value = 0.0000


# Text'ler Üzerinden Özellik Türetmek

In [17]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,NEW_CABIN_BOOL,NEW_IS_ALONE
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,NO
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,NO
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,YES
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,NO
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,YES


In [19]:
# Letter Count
df["NEW_NAME_COUNT"] = df["Name"].str.len()
df["NEW_NAME_COUNT"]

0      23
1      51
2      22
3      44
4      24
       ..
886    21
887    28
888    40
889    21
890    19
Name: NEW_NAME_COUNT, Length: 891, dtype: int64

In [20]:
# Word Count
df["NEW_NAME_WORD_COUNT"] = df["Name"].apply(lambda x: len(str(x).split(" ")))

In [21]:
# Özel Yapıları Yakalamak

In [23]:
df["NEW_NAME_DR"] = df["Name"].apply(lambda x: len([x for x in x.split() if x.startswith("Dr")]))
df["NEW_NAME_DR"]

0      0
1      0
2      0
3      0
4      0
      ..
886    0
887    0
888    0
889    0
890    0
Name: NEW_NAME_DR, Length: 891, dtype: int64

In [24]:
df.groupby("NEW_NAME_DR").agg({"Survived": ["mean","count"]})


Unnamed: 0_level_0,Survived,Survived
Unnamed: 0_level_1,mean,count
NEW_NAME_DR,Unnamed: 1_level_2,Unnamed: 2_level_2
0,0.38252,881
1,0.5,10


# Regex ile Değişken Türetmek

In [25]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,NEW_CABIN_BOOL,NEW_IS_ALONE,NEW_NAME_COUNT,NEW_NAME_WORD_COUNT,NEW_NAME_DR
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,NO,23,4,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,NO,51,7,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,YES,22,3,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,NO,44,7,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,YES,24,4,0


In [50]:
df['NEW_TITLE'] = df.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
df['NEW_TITLE']

0        Mr
1       Mrs
2      Miss
3       Mrs
4        Mr
       ... 
886     Rev
887    Miss
888    Miss
889      Mr
890      Mr
Name: NEW_TITLE, Length: 891, dtype: object

In [51]:
df[["NEW_TITLE", "Survived", "Age"]].groupby(["NEW_TITLE"]).agg({"Survived": "mean", "Age": ["count", "mean"]})

Unnamed: 0_level_0,Survived,Age,Age
Unnamed: 0_level_1,mean,count,mean
NEW_TITLE,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Capt,0.0,1,70.0
Col,0.5,2,58.0
Countess,1.0,1,33.0
Don,0.0,1,40.0
Dr,0.428571,6,42.0
Jonkheer,0.0,1,38.0
Lady,1.0,1,48.0
Major,0.5,2,48.5
Master,0.575,36,4.574167
Miss,0.697802,146,21.773973


# Date Değişkenleri Üretmek

In [59]:
dff = pd.read_csv("datasets/course_reviews.csv")

In [60]:
dff.head()

Unnamed: 0,Rating,Timestamp,Enrolled,Progress,Questions Asked,Questions Answered
0,5.0,2021-02-05 07:45:55,2021-01-25 15:12:08,5.0,0.0,0.0
1,5.0,2021-02-04 21:05:32,2021-02-04 20:43:40,1.0,0.0,0.0
2,4.5,2021-02-04 20:34:03,2019-07-04 23:23:27,1.0,0.0,0.0
3,5.0,2021-02-04 16:56:28,2021-02-04 14:41:29,10.0,0.0,0.0
4,4.0,2021-02-04 15:00:24,2020-10-13 03:10:07,10.0,0.0,0.0


In [61]:
dff.info

<bound method DataFrame.info of       Rating            Timestamp             Enrolled  Progress  \
0        5.0  2021-02-05 07:45:55  2021-01-25 15:12:08       5.0   
1        5.0  2021-02-04 21:05:32  2021-02-04 20:43:40       1.0   
2        4.5  2021-02-04 20:34:03  2019-07-04 23:23:27       1.0   
3        5.0  2021-02-04 16:56:28  2021-02-04 14:41:29      10.0   
4        4.0  2021-02-04 15:00:24  2020-10-13 03:10:07      10.0   
...      ...                  ...                  ...       ...   
4318     5.0  2019-05-17 09:51:44  2019-05-17 09:08:53      34.0   
4319     5.0  2019-05-16 21:27:05  2019-05-16 20:32:15       5.0   
4320     5.0  2019-05-16 20:22:26  2019-05-16 20:21:19       1.0   
4321     5.0  2019-05-16 19:49:07  2019-05-16 19:47:29       1.0   
4322     5.0  2019-05-16 13:40:35  2019-05-15 14:10:24      56.0   

      Questions Asked  Questions Answered  
0                 0.0                 0.0  
1                 0.0                 0.0  
2                 0

In [63]:
# timestamp değişkeni object türdeydi datetime tipine dönüştürdük

In [64]:
dff['Timestamp'] = pd.to_datetime(dff["Timestamp"], format="%Y-%m-%d")

In [66]:
# year
dff['year'] = dff['Timestamp'].dt.year
dff['year'] 

0       2021
1       2021
2       2021
3       2021
4       2021
        ... 
4318    2019
4319    2019
4320    2019
4321    2019
4322    2019
Name: year, Length: 4323, dtype: int64

In [67]:
# month
dff['month'] = dff['Timestamp'].dt.month
dff['month']

0       2
1       2
2       2
3       2
4       2
       ..
4318    5
4319    5
4320    5
4321    5
4322    5
Name: month, Length: 4323, dtype: int64

In [69]:

# year diff  = yıl farkı
dff['year_diff'] = date.today().year - dff['Timestamp'].dt.year
dff['year_diff']

0       1
1       1
2       1
3       1
4       1
       ..
4318    3
4319    3
4320    3
4321    3
4322    3
Name: year_diff, Length: 4323, dtype: int64

In [70]:
# month diff (iki tarih arasındaki ay farkı): yıl farkı + ay farkı
dff['month_diff'] = (date.today().year - dff['Timestamp'].dt.year) * 12 + date.today().month - dff['Timestamp'].dt.month
dff['month_diff']

0       17
1       17
2       17
3       17
4       17
        ..
4318    38
4319    38
4320    38
4321    38
4322    38
Name: month_diff, Length: 4323, dtype: int64

In [72]:
# day name
dff['day_name'] = dff['Timestamp'].dt.day_name()
dff['day_name']

0         Friday
1       Thursday
2       Thursday
3       Thursday
4       Thursday
          ...   
4318      Friday
4319    Thursday
4320    Thursday
4321    Thursday
4322    Thursday
Name: day_name, Length: 4323, dtype: object

In [73]:
dff.head()

Unnamed: 0,Rating,Timestamp,Enrolled,Progress,Questions Asked,Questions Answered,year,month,year_diff,month_diff,day_name
0,5.0,2021-02-05 07:45:55,2021-01-25 15:12:08,5.0,0.0,0.0,2021,2,1,17,Friday
1,5.0,2021-02-04 21:05:32,2021-02-04 20:43:40,1.0,0.0,0.0,2021,2,1,17,Thursday
2,4.5,2021-02-04 20:34:03,2019-07-04 23:23:27,1.0,0.0,0.0,2021,2,1,17,Thursday
3,5.0,2021-02-04 16:56:28,2021-02-04 14:41:29,10.0,0.0,0.0,2021,2,1,17,Thursday
4,4.0,2021-02-04 15:00:24,2020-10-13 03:10:07,10.0,0.0,0.0,2021,2,1,17,Thursday


# Feature Interactions (Özellik Etkileşimleri)

In [80]:
df = load()

In [81]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [82]:
df["NEW_AGE_PCLASS"] = df["Age"] * df["Pclass"]
df["NEW_AGE_PCLASS"]

0       66.0
1       38.0
2       78.0
3       35.0
4      105.0
       ...  
886     54.0
887     19.0
888      NaN
889     26.0
890     96.0
Name: NEW_AGE_PCLASS, Length: 891, dtype: float64

In [83]:
df["NEW_FAMILY_SIZE"] = df["SibSp"] + df["Parch"] + 1
df["NEW_FAMILY_SIZE"]

0      2
1      2
2      1
3      2
4      1
      ..
886    1
887    1
888    4
889    1
890    1
Name: NEW_FAMILY_SIZE, Length: 891, dtype: int64

In [88]:
df.loc[(df['Sex'] == 'male') & (df['Age'] <= 21), 'NEW_SEX_CAT'] = 'youngmale'

In [89]:
df.loc[(df['Sex'] == 'male') & (df['Age'] > 21) & (df['Age'] < 50), 'NEW_SEX_CAT'] = 'maturemale'

In [91]:

df.loc[(df['Sex'] == 'male') & (df['Age'] >= 50), 'NEW_SEX_CAT'] = 'seniormale'

In [92]:
df.loc[(df['Sex'] == 'female') & (df['Age'] <= 21), 'NEW_SEX_CAT'] = 'youngfemale'

In [93]:
df.loc[(df['Sex'] == 'female') & (df['Age'] > 21) & (df['Age'] < 50), 'NEW_SEX_CAT'] = 'maturefemale'


In [94]:
df.loc[(df['Sex'] == 'female') & (df['Age'] >= 50), 'NEW_SEX_CAT'] = 'seniorfemale'


In [95]:
df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,NEW_AGE_PCLASS,NEW_FAMILY_SIZE,NEW_SEX_CAT
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,66.0,2,maturemale
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,38.0,2,maturefemale
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,78.0,1,maturefemale
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,35.0,2,maturefemale
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,105.0,1,maturemale


In [96]:
df.groupby("NEW_SEX_CAT")["Survived"].mean()

NEW_SEX_CAT
maturefemale    0.774194
maturemale      0.199288
seniorfemale    0.909091
seniormale      0.134615
youngfemale     0.678571
youngmale       0.250000
Name: Survived, dtype: float64

In [97]:
# YENİ OLUŞTURDUGUMUZ KOLON SAYESİNDE COK GÜZEL Bİ CIKTI ELDE ETTİK !! :)