In [1]:
# ライブラリの読み込み
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample_sub = pd.read_csv("sample_submit.csv", header=None)

In [3]:
print(train.shape)
print(train.dtypes)

(55583, 29)
id                          int64
accommodates                int64
amenities                  object
bathrooms                 float64
bed_type                   object
bedrooms                  float64
beds                      float64
cancellation_policy        object
city                       object
cleaning_fee               object
description                object
first_review               object
host_has_profile_pic       object
host_identity_verified     object
host_response_rate         object
host_since                 object
instant_bookable           object
last_review                object
latitude                  float64
longitude                 float64
name                       object
neighbourhood              object
number_of_reviews           int64
property_type              object
review_scores_rating      float64
room_type                  object
thumbnail_url              object
zipcode                    object
y                         float64
dt

In [4]:
train["host_response_rate"].value_counts()

100%    32510
90%      1666
80%       824
0%        660
50%       464
        ...  
6%          1
31%         1
21%         1
15%         1
39%         1
Name: host_response_rate, Length: 78, dtype: int64

In [5]:
train.isnull().sum()

id                            0
accommodates                  0
amenities                     0
bathrooms                   147
bed_type                      0
bedrooms                     71
beds                         96
cancellation_policy           0
city                          0
cleaning_fee                  0
description                   0
first_review              11908
host_has_profile_pic        148
host_identity_verified      148
host_response_rate        13704
host_since                  148
instant_bookable              0
last_review               11880
latitude                      0
longitude                     0
name                          0
neighbourhood              5160
number_of_reviews             0
property_type                 0
review_scores_rating      12556
room_type                     0
thumbnail_url              6145
zipcode                     716
y                             0
dtype: int64

In [6]:
test.isnull().sum()

id                           0
accommodates                 0
amenities                    0
bathrooms                   53
bed_type                     0
bedrooms                    20
beds                        35
cancellation_policy          0
city                         0
cleaning_fee                 0
description                  0
first_review              3956
host_has_profile_pic        40
host_identity_verified      40
host_response_rate        4595
host_since                  40
instant_bookable             0
last_review               3947
latitude                     0
longitude                    0
name                         0
neighbourhood             1712
number_of_reviews            0
property_type                0
review_scores_rating      4166
room_type                    0
thumbnail_url             2071
zipcode                    250
dtype: int64

In [7]:
train["bed_type"].value_counts()

Real Bed         53989
Futon              569
Pull-out Sofa      453
Airbed             364
Couch              208
Name: bed_type, dtype: int64

In [14]:
train["neighbourhood"]

0                      NaN
1                Brookland
2                 Bushwick
3                 Nob Hill
4          Upper West Side
               ...        
55578    Flatiron District
55579                  NaN
55580     Mission District
55581               Reseda
55582         Sherman Oaks
Name: neighbourhood, Length: 55583, dtype: object

# 説明変数としてありうるもの  
["accommodates","bathrooms","bed_type","bedrooms","beds","city","cleaning_fee","latitude","longitude"]

In [8]:
train["city"].value_counts()

NYC        24326
LA         16828
SF          4768
DC          4259
Chicago     2807
Boston      2595
Name: city, dtype: int64

In [9]:
train_filter = train[["accommodates","bathrooms","bed_type","bedrooms","beds","city","cleaning_fee","latitude","longitude"]]
test_filter = test[["accommodates","bathrooms","bed_type","bedrooms","beds","city","cleaning_fee","latitude","longitude"]]

In [10]:
#欠損値処理
items = ["bathrooms","bedrooms","beds"]
for item in items:
    nan_value = (train_filter[item].sum() + test_filter[item].sum())/(len(train_filter) + len(test_filter))
    train_filter[item] =train_filter[item].fillna(nan_value)
    test_filter[item] =test_filter[item].fillna(nan_value)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_filter[item] =train_filter[item].fillna(nan_value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_filter[item] =test_filter[item].fillna(nan_value)


In [31]:
train_filter.isnull().sum()

accommodates    0
bathrooms       0
bed_type        0
bedrooms        0
beds            0
city            0
cleaning_fee    0
latitude        0
longitude       0
dtype: int64

In [16]:
train_filter.dtypes

accommodates      int64
bathrooms       float64
bed_type          int64
bedrooms        float64
beds            float64
city              int64
cleaning_fee      int64
latitude        float64
longitude       float64
dtype: object

In [14]:
train["cleaning_fee"].value_counts()

t    40821
f    14762
Name: cleaning_fee, dtype: int64

In [15]:
#ラベルエンコーディング
object_columns = ["bed_type","city","cleaning_fee"]
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

for column in object_columns:
    train_filter[column] = encoder.fit_transform(train_filter[column])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_filter[column] = encoder.fit_transform(train_filter[column])


In [24]:
#2変数のVIFを計測

def VIF(corr):
    return 1/(1-corr**2)

correlation_matrix = train_filter.corr()

columns = ["accommodates","bathrooms","bed_type","bedrooms","beds","city","cleaning_fee","latitude","longitude"]
for column1 in columns:
    for column2 in columns:
        if(column1!=column2):
            if VIF(correlation_matrix[column1][column2])>1:
                print("{} and {} のVIF:{}".format(column1,column2,str(VIF(correlation_matrix[column1][column2]))))
                
correlation_matrix

accommodates and bathrooms のVIF:1.3364763601846108
accommodates and bed_type のVIF:1.0056681412797344
accommodates and bedrooms のVIF:2.0070948448582233
accommodates and beds のVIF:2.8506685219523566
accommodates and city のVIF:1.0104764751475357
accommodates and cleaning_fee のVIF:1.0340182782770377
accommodates and latitude のVIF:1.006343591721376
accommodates and longitude のVIF:1.0075149992551422
bathrooms and accommodates のVIF:1.3364763601846108
bathrooms and bed_type のVIF:1.001681264576713
bathrooms and bedrooms のVIF:1.5231367040913768
bathrooms and beds のVIF:1.3795901821440135
bathrooms and city のVIF:1.0062903703650548
bathrooms and cleaning_fee のVIF:1.0027726661259924
bathrooms and latitude のVIF:1.0186309232513284
bathrooms and longitude のVIF:1.0176939176953825
bed_type and accommodates のVIF:1.0056681412797344
bed_type and bathrooms のVIF:1.001681264576713
bed_type and bedrooms のVIF:1.0030755435097698
bed_type and beds のVIF:1.0045787759891693
bed_type and city のVIF:1.0000168135041088
b

Unnamed: 0,accommodates,bathrooms,bed_type,bedrooms,beds,city,cleaning_fee,latitude,longitude
accommodates,1.0,0.501761,0.075075,0.708355,0.805733,-0.101823,0.181381,-0.079395,-0.086365
bathrooms,0.501761,1.0,0.040969,0.586055,0.524545,-0.079064,0.052583,-0.135241,-0.131857
bed_type,0.075075,0.040969,1.0,0.055372,0.067512,0.0041,0.037313,0.002161,0.004382
bedrooms,0.708355,0.586055,0.055372,1.0,0.70789,-0.067522,0.106615,-0.057467,-0.073981
beds,0.805733,0.524545,0.067512,0.70789,1.0,-0.068174,0.131655,-0.080239,-0.08236
city,-0.101823,-0.079064,0.0041,-0.067522,-0.068174,1.0,-0.01823,-0.013491,-0.067891
cleaning_fee,0.181381,0.052583,0.037313,0.106615,0.131655,-0.01823,1.0,-0.061194,-0.068876
latitude,-0.079395,-0.135241,0.002161,-0.057467,-0.080239,-0.013491,-0.061194,1.0,0.896581
longitude,-0.086365,-0.131857,0.004382,-0.073981,-0.08236,-0.067891,-0.068876,0.896581,1.0
