In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
 
from matplotlib import pyplot as plt
 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('propuski_final_data.csv', delimiter=';')
data.head(5)

Unnamed: 0.1,Unnamed: 0,address,z_address,bathrooms,bedrooms,finishedsqft,lastsolddate,lastsoldprice,latitude,longitude,neighborhood,totalrooms,usecode,yearbuilt,zestimate,zindexvalue,zipcode,zpid
0,2,Address: 1160 Mission Street #2007,1160 Mission St UNIT 2007,2.0,2.0,1043.0,02/17/2016,1300000.0,37.778705,-122.412635,South of Market,4.0,Condominium,2007.0,1167508.0,975700,94103.0,83152781.0
1,5,Address: 260 King Street #475,260 King St UNIT 475,1.0,1.0,903.0,02/17/2016,750000.0,37.777641,-122.393417,South of Market,3.0,Condominium,2004.0,823719.0,975700,94107.0,69819817.0
2,7,Address: 560 Missouri Street #B,560 Missouri St # B,4.0,3.0,1425.0,02/17/2016,1495000.0,37.759198,-122.396516,Potrero Hill,6.0,Condominium,2003.0,1708594.0,1277600,94107.0,64972847.0
3,9,Address: 350 Missouri Street,350 Missouri St,3.0,3.0,2231.0,02/17/2016,2700000.0,37.761886,-122.396769,Potrero Hill,10.0,SingleFamily,1927.0,2411236.0,1277600,94107.0,15149489.0
4,11,Address: 3658 Folsom Street,3658 Folsom St,3.0,3.0,1300.0,02/17/2016,1530000.0,37.740795,-122.413453,Bernal Heights,4.0,SingleFamily,1900.0,1918539.0,1248000,94110.0,15161978.0


In [9]:
data.shape

(11330, 18)

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11330 entries, 0 to 11329
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     11330 non-null  int64  
 1   address        11330 non-null  object 
 2   z_address      11330 non-null  object 
 3   bathrooms      11330 non-null  float64
 4   bedrooms       11330 non-null  float64
 5   finishedsqft   11211 non-null  float64
 6   lastsolddate   11330 non-null  object 
 7   lastsoldprice  11330 non-null  float64
 8   latitude       11315 non-null  float64
 9   longitude      11315 non-null  float64
 10  neighborhood   11330 non-null  object 
 11  totalrooms     11281 non-null  float64
 12  usecode        11290 non-null  object 
 13  yearbuilt      11330 non-null  float64
 14  zestimate      11330 non-null  float64
 15  zindexvalue    11330 non-null  object 
 16  zipcode        11330 non-null  float64
 17  zpid           11330 non-null  float64
dtypes: flo

Будем использовать линейную регрессию для тестирования качества данных

In [11]:
def get_score(X,y, random_seed=42, model=None):
  if model is None:
    model = LinearRegression()
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_seed )  
  model.fit(X_train, y_train)
  return model.score(X_test, y_test)

Для поиска выбросов будем использовать boxplot

In [12]:
def get_boxplot(X, columns=[]):
  for i in columns:
    sns.boxplot(x=X[i])
  pass  

def get_pairplot(X, columns=None):
  if columns is None:
    columns = list(X.columns)
  sns.pairplot(X[columns])
  pass    

Для заполнения пропущенных значений будем использовать стандартные способы
- заполнение значением
- max
- min
- mode
- median
- mean
- метод индикатора
- линейная регрессия

In [13]:
def get_value(X, column, mode='mean', value=0, columns_for_reg=None):
  if mode == 'value':
    X.loc[X[X[column].isna()].index ,column] = value
  elif mode == 'max':
    X.loc[X[X[column].isna()].index ,column] = X[column].max()
  elif mode == 'min':
    X.loc[X[X[column].isna()].index ,column] = X[column].min()
  elif mode == 'median':
    X.loc[X[X[column].isna()].index ,column] = X[column].median()    
  elif mode == 'mode':
    X.loc[X[X[column].isna()].index, column] = X[column].mode()[0]  
  elif mode == 'indicator':
    X.loc[X[X[column].isna()].index, column] = 0
    X['ind_'+str(column)] = 0
    X.loc[X[X[column].isna()].index, 'ind_'+str(column)] = 1
  elif mode == 'linreg':
    if columns_for_reg is None:
      cols = list(X.select_dtypes([np.number]).columns) 
      cols.remove(column)
    else:
      cols = columns_for_reg
    X_tmp = X.dropna()  
    m = LinearRegression().fit(X_tmp[cols], X_tmp[column])
    X.loc[X[X[column].isna()].index, column] = m.predict(X[X[column].isna()][cols])
  else:
    X.loc[X[X[column].isna()].index, column] = X[column].mean()
  return X  


In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11330 entries, 0 to 11329
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     11330 non-null  int64  
 1   address        11330 non-null  object 
 2   z_address      11330 non-null  object 
 3   bathrooms      11330 non-null  float64
 4   bedrooms       11330 non-null  float64
 5   finishedsqft   11211 non-null  float64
 6   lastsolddate   11330 non-null  object 
 7   lastsoldprice  11330 non-null  float64
 8   latitude       11315 non-null  float64
 9   longitude      11315 non-null  float64
 10  neighborhood   11330 non-null  object 
 11  totalrooms     11281 non-null  float64
 12  usecode        11290 non-null  object 
 13  yearbuilt      11330 non-null  float64
 14  zestimate      11330 non-null  float64
 15  zindexvalue    11330 non-null  object 
 16  zipcode        11330 non-null  float64
 17  zpid           11330 non-null  float64
dtypes: flo

In [15]:
data.describe()

Unnamed: 0.1,Unnamed: 0,bathrooms,bedrooms,finishedsqft,lastsoldprice,latitude,longitude,totalrooms,yearbuilt,zestimate,zipcode,zpid
count,11330.0,11330.0,11330.0,11211.0,11330.0,11315.0,11315.0,11281.0,11330.0,11330.0,11330.0,11330.0
mean,9171.729214,1.980229,2.614475,1586.081349,1263928.0,37.759715,-122.436508,6.113997,1948.498147,1565695.0,94116.912004,36899730.0
std,4921.941074,1.047358,1.299457,922.815803,1042079.0,0.025583,0.030742,12.151195,37.911196,1229417.0,9.400877,78007410.0
min,2.0,0.5,0.0,1.0,535.0,37.70817,-122.510726,1.0,1860.0,432385.0,94102.0,15063290.0
25%,5039.75,1.0,2.0,1019.0,729250.0,37.739284,-122.455149,4.0,1916.0,905237.5,94110.0,15108470.0
50%,9198.5,2.0,2.0,1362.0,990000.0,37.760514,-122.43251,5.0,1940.0,1230758.0,94115.0,15156970.0
75%,13374.75,2.0,3.0,1878.0,1450000.0,37.781386,-122.413353,7.0,1986.0,1731170.0,94123.0,59700400.0
max,17632.0,14.0,20.0,27275.0,23889000.0,37.806083,-122.381201,1264.0,2016.0,15533250.0,94158.0,2146999000.0


In [16]:
data.columns

Index(['Unnamed: 0', 'address', 'z_address', 'bathrooms', 'bedrooms',
       'finishedsqft', 'lastsolddate', 'lastsoldprice', 'latitude',
       'longitude', 'neighborhood', 'totalrooms', 'usecode', 'yearbuilt',
       'zestimate', 'zindexvalue', 'zipcode', 'zpid'],
      dtype='object')

In [17]:
cols = ['Unnamed: 0', 'bathrooms', 'bedrooms',
       'finishedsqft', 'lastsoldprice', 'latitude',
       'longitude', 'totalrooms','yearbuilt','zipcode']
cols2 = ['Unnamed: 0', 'bathrooms', 'bedrooms',
       'finishedsqft', 'latitude',
       'longitude', 'totalrooms','yearbuilt','zipcode']

In [18]:
data.dropna().shape

(11108, 18)

In [19]:
-             = 0.8375491685999106, 0.607156739569225
finishedsqft  = 0.8524615199933394, 0.5577844171230759
long and lat  = 0.8517111453083177, 0.6874289038247563
totalrooms    = 0.8598254992362179, 0.6426676296618906

SyntaxError: invalid syntax (<ipython-input-19-f98496e9d777>, line 1)

In [20]:
get_score(data.dropna()[cols], data.dropna()['zestimate'])

0.8375491685999525

In [21]:
get_score(data.dropna()[cols2], data.dropna()['zestimate'])

0.6071567395692319

In [22]:
for_drop = ['Unnamed: 0', 'address', 'zindexvalue', 'zpid']
target = ['zestimate']

In [23]:

_ = get_value(data, 'finishedsqft', mode='linreg', value=0, columns_for_reg=['bathrooms', 'bedrooms', 'lastsoldprice','totalrooms'])

In [24]:
data['neighborhood'].value_counts()

Mission             540
Bernal Heights      525
South of Market     524
South Beach         461
Pacific Heights     439
                   ... 
North Beach          31
Lakeside             29
Little Hollywood     12
North Waterfront      8
Daly City             3
Name: neighborhood, Length: 71, dtype: int64

In [25]:
for i, el in data[data['latitude'].isna()].iterrows():
  data.loc[i, 'longitude'] = data[data['neighborhood'] == el['neighborhood']]['longitude'].mean()
  data.loc[i, 'latitude'] = data[data['neighborhood'] == el['neighborhood']]['latitude'].mean()


In [26]:
data[data.totalrooms - (data.bedrooms + data.bathrooms) < 0]

Unnamed: 0.1,Unnamed: 0,address,z_address,bathrooms,bedrooms,finishedsqft,lastsolddate,lastsoldprice,latitude,longitude,neighborhood,totalrooms,usecode,yearbuilt,zestimate,zindexvalue,zipcode,zpid
2,7,Address: 560 Missouri Street #B,560 Missouri St # B,4.00,3.0,1425.0,02/17/2016,1495000.0,37.759198,-122.396516,Potrero Hill,6.0,Condominium,2003.0,1708594.0,1277600,94107.0,64972847.0
4,11,Address: 3658 Folsom Street,3658 Folsom St,3.00,3.0,1300.0,02/17/2016,1530000.0,37.740795,-122.413453,Bernal Heights,4.0,SingleFamily,1900.0,1918539.0,1248000,94110.0,15161978.0
39,74,Address: 2455 14th Avenue,2455 14th Ave,3.75,4.0,1846.0,02/10/2016,1525000.0,37.742271,-122.470518,West Portal,5.0,SingleFamily,1932.0,1195679.0,1565000,94116.0,15122857.0
46,93,Address: 784 Spruce Street,784-786 Spruce St,3.00,8.0,2600.0,02/09/2016,1535000.0,37.780606,-122.452663,Lone Mountain,10.0,MultiFamily2To4,1927.0,1524759.0,1447700,94118.0,15083824.0
60,121,Address: 2017 Buchanan Street,2017 Buchanan St,4.00,4.0,2850.0,02/05/2016,2800000.0,37.788913,-122.430728,Lower Pacific Heights,7.0,Condominium,1890.0,3032479.0,1178700,94115.0,21699068.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11255,17466,Address: 560 Missouri Street #B,560 Missouri St # B,4.00,3.0,1425.0,02/17/2016,1495000.0,37.759198,-122.396516,Potrero Hill,6.0,Condominium,2003.0,1685676.0,1264000,94107.0,64972847.0
11257,17470,Address: 3658 Folsom Street,3658 Folsom St,3.00,3.0,1300.0,02/17/2016,1530000.0,37.740795,-122.413453,Bernal Heights,4.0,SingleFamily,1900.0,1899916.0,1214400,94110.0,15161978.0
11292,17533,Address: 2455 14th Avenue,2455 14th Ave,3.75,4.0,1846.0,02/10/2016,1525000.0,37.742271,-122.470518,West Portal,5.0,SingleFamily,1932.0,1228368.0,1551600,94116.0,15122857.0
11299,17552,Address: 784 Spruce Street,784-786 Spruce St,3.00,8.0,2600.0,02/09/2016,1535000.0,37.780606,-122.452663,Lone Mountain,10.0,MultiFamily2To4,1927.0,1760133.0,1419600,94118.0,15083824.0


In [27]:
get_score(data.dropna()[['finishedsqft']], data.dropna()['zestimate'])

0.6064430886926846

In [28]:

_ = get_value(data, 'totalrooms', mode='linreg', value=0, columns_for_reg=['bathrooms', 'bedrooms', 'lastsoldprice','finishedsqft'])

In [29]:
data['totalrooms'] = data['totalrooms'].apply(lambda x: round(x))

In [30]:
data['usecode'].value_counts(dropna=False) 

SingleFamily        5785
Condominium         4782
MultiFamily2To4      484
Duplex               146
Townhouse             66
NaN                   40
Miscellaneous         17
Apartment              3
Cooperative            3
MultiFamily5Plus       2
Mobile                 2
Name: usecode, dtype: int64

In [31]:
_ = get_value(data, 'usecode', mode='value', value='other')

In [32]:
data.head(2)

Unnamed: 0.1,Unnamed: 0,address,z_address,bathrooms,bedrooms,finishedsqft,lastsolddate,lastsoldprice,latitude,longitude,neighborhood,totalrooms,usecode,yearbuilt,zestimate,zindexvalue,zipcode,zpid
0,2,Address: 1160 Mission Street #2007,1160 Mission St UNIT 2007,2.0,2.0,1043.0,02/17/2016,1300000.0,37.778705,-122.412635,South of Market,4,Condominium,2007.0,1167508.0,975700,94103.0,83152781.0
1,5,Address: 260 King Street #475,260 King St UNIT 475,1.0,1.0,903.0,02/17/2016,750000.0,37.777641,-122.393417,South of Market,3,Condominium,2004.0,823719.0,975700,94107.0,69819817.0


In [33]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11330 entries, 0 to 11329
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     11330 non-null  int64  
 1   address        11330 non-null  object 
 2   z_address      11330 non-null  object 
 3   bathrooms      11330 non-null  float64
 4   bedrooms       11330 non-null  float64
 5   finishedsqft   11330 non-null  float64
 6   lastsolddate   11330 non-null  object 
 7   lastsoldprice  11330 non-null  float64
 8   latitude       11330 non-null  float64
 9   longitude      11330 non-null  float64
 10  neighborhood   11330 non-null  object 
 11  totalrooms     11330 non-null  int64  
 12  usecode        11330 non-null  object 
 13  yearbuilt      11330 non-null  float64
 14  zestimate      11330 non-null  float64
 15  zindexvalue    11330 non-null  object 
 16  zipcode        11330 non-null  float64
 17  zpid           11330 non-null  float64
dtypes: flo

In [34]:
data.to_csv('new_data.csv')

 ## Вторая часть

In [35]:
data = pd.read_csv('new_data.csv')

In [36]:
def get_one_hot(X, cols):
    for each in cols:
        dummies = pd.get_dummies(X[each], prefix=each, drop_first=False)
        X = pd.concat([X, dummies], axis=1)
    return X

def get_woe_v1(X, col, target_col):
    all_one = X[col].value_counts() / X[col].value_counts()
    all_good = len(X[X[target_col] == 1][col])
    all_bad = len(X[X[target_col] == 0][col])
    odds_series = (
        (((all_one*X[df_train[target_col] == 1][col].value_counts()).fillna(0)+0.5)/all_good)
        /
        (((all_one*X[X[target_col] == 0][col].value_counts()).fillna(0)+0.5)/all_bad)
    )
    category_woe_dict = np.log(odds_series).to_dict()
    X[col + '_woe'] = X[col].apply(category_woe_dict.get)
    return X

def get_woe_cols(X, cols, target_col):
    for col in cols:
        X = get_woe_v1(X, col, target_col)
    return X    

