In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, TimeSeriesSplit
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn import model_selection, linear_model, metrics, pipeline, preprocessing
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns


In [8]:
data = pd.read_csv('weatherAUS.csv')
print(data.shape)
data.head()

(142193, 24)


Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0.0,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,25.0,1010.6,1007.8,,,17.2,24.3,No,0.0,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,0.0,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,16.0,1017.6,1012.8,,,18.1,26.5,No,1.0,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2,No


In [9]:
data.dtypes

Date              object
Location          object
MinTemp          float64
MaxTemp          float64
Rainfall         float64
Evaporation      float64
Sunshine         float64
WindGustDir       object
WindGustSpeed    float64
WindDir9am        object
WindDir3pm        object
WindSpeed9am     float64
WindSpeed3pm     float64
Humidity9am      float64
Humidity3pm      float64
Pressure9am      float64
Pressure3pm      float64
Cloud9am         float64
Cloud3pm         float64
Temp9am          float64
Temp3pm          float64
RainToday         object
RISK_MM          float64
RainTomorrow      object
dtype: object

In [10]:
data['Date'] = pd.to_datetime(data['Date'], format='%Y-%m-%d')
y = data['RainTomorrow']
y = [1 if item == 'Yes' else 0 for item in y]
data = data.drop(columns=['RainTomorrow'])
data['RainToday'] = data['RainToday'].apply(lambda item: 1 if item == 'Yes' else 0)

In [11]:
cat_features = list(data.dtypes[(data.dtypes == 'object')].keys())
num_features = [col for col in data.columns if col not in set(cat_features) and col != 'Date']
print(cat_features)
print(num_features)

['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm']
['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 'RainToday', 'RISK_MM']


In [12]:
# sns.heatmap(data[num_features], annot=True)

In [13]:
#теперь обработаем пропуски
# для начала посчитаем проценты NaN в признаках
missings = [] # find missing persentage in columns
for col in data.columns:
    pct_missing = np.mean(data[col].isnull())
    missings.append((col, pct_missing*100))
#     print('{} - {}%'.format(col, round(pct_missing*100)))
missings.sort(key=lambda item: item[1])

In [14]:
missings

[('Date', 0.0),
 ('Location', 0.0),
 ('RainToday', 0.0),
 ('RISK_MM', 0.0),
 ('MaxTemp', 0.2264527789694289),
 ('MinTemp', 0.44798267143952236),
 ('Temp9am', 0.635755627914173),
 ('WindSpeed9am', 0.948007285872019),
 ('Rainfall', 0.9887969168665124),
 ('Humidity9am', 1.247600092831574),
 ('WindSpeed3pm', 1.8495987847503041),
 ('Temp3pm', 1.9171126567411898),
 ('Humidity3pm', 2.5388028946572616),
 ('WindDir3pm', 2.6569521706413113),
 ('WindGustSpeed', 6.5193082641198945),
 ('WindGustDir', 6.561504434114197),
 ('WindDir9am', 7.041837502549352),
 ('Pressure3pm', 9.832410878172624),
 ('Pressure9am', 9.85561877166949),
 ('Cloud9am', 37.73533155640573),
 ('Cloud3pm', 40.15246882757942),
 ('Evaporation', 42.78902618272348),
 ('Sunshine', 47.692924405561456)]

In [15]:
columns_to_fill = [item[0] for item in missings if item[1] > 0 ] #признаки с пропущенными переменными
columns_to_fill

['MaxTemp',
 'MinTemp',
 'Temp9am',
 'WindSpeed9am',
 'Rainfall',
 'Humidity9am',
 'WindSpeed3pm',
 'Temp3pm',
 'Humidity3pm',
 'WindDir3pm',
 'WindGustSpeed',
 'WindGustDir',
 'WindDir9am',
 'Pressure3pm',
 'Pressure9am',
 'Cloud9am',
 'Cloud3pm',
 'Evaporation',
 'Sunshine']

In [16]:
for col in columns_to_fill:
    if col in cat_features:
        top = data[col].value_counts().argmax() # impute with the most frequent value.
        data[col] = data[col].fillna(top)
    else:
        mean = data[col].mean()
        data[col] = data[col].fillna(mean) # impute with the mean

In [17]:
data['Date'] = pd.to_datetime(data['Date'], format='%Y-%m-%d')
data['year'] = data.Date.apply(lambda x : x.year)
data['month'] = data.Date.apply(lambda x : x.month)
data['day'] = data.Date.apply(lambda x : x.day)
data = data.drop(columns=['Date'])

In [18]:
data.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,year,month,day
0,Albury,13.4,22.9,0.6,5.469824,7.624853,W,44.0,W,WNW,...,1007.1,8.0,4.503167,16.9,21.8,0,0.0,2008,12,1
1,Albury,7.4,25.1,0.0,5.469824,7.624853,WNW,44.0,NNW,WSW,...,1007.8,4.437189,4.503167,17.2,24.3,0,0.0,2008,12,2
2,Albury,12.9,25.7,0.0,5.469824,7.624853,WSW,46.0,W,WSW,...,1008.7,4.437189,2.0,21.0,23.2,0,0.0,2008,12,3
3,Albury,9.2,28.0,0.0,5.469824,7.624853,NE,24.0,SE,E,...,1012.8,4.437189,4.503167,18.1,26.5,0,1.0,2008,12,4
4,Albury,17.5,32.3,1.0,5.469824,7.624853,W,41.0,ENE,NW,...,1006.0,7.0,8.0,17.8,29.7,0,0.2,2008,12,5


In [19]:
X = data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

In [20]:
binary_data_columns = ['RainToday']
binary_data_indices = np.array([(column in binary_data_columns) for column in X_train.columns], dtype = bool)

In [21]:
print(binary_data_columns)
print(binary_data_indices)

['RainToday']
[False False False False False False False False False False False False
 False False False False False False False False  True False False False
 False]


In [22]:
categorical_data_columns = cat_features
categorical_data_indices = np.array([(column in categorical_data_columns) for column in X_train.columns], dtype = bool)

In [23]:
print(categorical_data_columns)
print(categorical_data_indices)

['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm']
[ True False False False False False  True False  True  True False False
 False False False False False False False False False False False False
 False]


In [24]:
numeric_data_columns = num_features
numeric_data_indices = np.array([(column in numeric_data_columns) for column in X_train.columns], dtype = bool)

In [25]:
print(numeric_data_columns)
print(numeric_data_indices)

['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 'RainToday', 'RISK_MM']
[False  True  True  True  True  True False  True False False  True  True
  True  True  True  True  True  True  True  True  True  True False False
 False]


In [26]:
regressor = LogisticRegression()

In [27]:
X_train.iloc[:, categorical_data_indices].dtypes

Location       object
WindGustDir    object
WindDir9am     object
WindDir3pm     object
dtype: object

In [28]:
estimator = pipeline.Pipeline(steps = [       
    ('feature_processing', pipeline.FeatureUnion(transformer_list = [        
            #binary
            ('binary_variables_processing', preprocessing.FunctionTransformer(lambda data: data.iloc[:, binary_data_indices])), 
                    
            #numeric
            ('numeric_variables_processing', pipeline.Pipeline(steps = [
                ('selecting', preprocessing.FunctionTransformer(lambda data: data.iloc[:, numeric_data_indices])),
                ('scaling', preprocessing.StandardScaler())            
                        ])),
        
            #categorical
            ('categorical_variables_processing', pipeline.Pipeline(steps = [
                ('selecting', preprocessing.FunctionTransformer(lambda data: data.iloc[:, categorical_data_indices].astype(str))),
                ('hot_encoding', preprocessing.OneHotEncoder(handle_unknown = 'ignore'))            
                        ])),
        ])),
    ('model_fitting', regressor)
    ]
)

In [29]:
estimator.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(steps=[('feature_processing',
                 FeatureUnion(transformer_list=[('binary_variables_processing',
                                                 FunctionTransformer(func=<function <lambda> at 0x185307950>)),
                                                ('numeric_variables_processing',
                                                 Pipeline(steps=[('selecting',
                                                                  FunctionTransformer(func=<function <lambda> at 0x185307440>)),
                                                                 ('scaling',
                                                                  StandardScaler())])),
                                                ('categorical_variables_processing',
                                                 Pipeline(steps=[('selecting',
                                                                  FunctionTransformer(func=<function <lambda> at 0x185307680>)),
                      

In [30]:
metrics.roc_auc_score(y_test, estimator.predict(X_test))

0.997391417764445

In [31]:
bayes = GaussianNB()

In [32]:
estimator = pipeline.Pipeline(steps = [       
    ('feature_processing', pipeline.FeatureUnion(transformer_list = [        
            #binary
            ('binary_variables_processing', preprocessing.FunctionTransformer(lambda data: data.iloc[:, binary_data_indices])), 
                    
            #numeric
            ('numeric_variables_processing', pipeline.Pipeline(steps = [
                ('selecting', preprocessing.FunctionTransformer(lambda data: data.iloc[:, numeric_data_indices])),
                ('scaling', preprocessing.StandardScaler())            
                        ])),
        
            #categorical
            ('categorical_variables_processing', pipeline.Pipeline(steps = [
                ('selecting', preprocessing.FunctionTransformer(lambda data: data.iloc[:, categorical_data_indices].astype(str))),
                ('hot_encoding', preprocessing.OneHotEncoder(handle_unknown = 'ignore', sparse=False))            
                        ])),
        ])),
    ('model_fitting', bayes)
    ]
)

In [33]:
estimator.fit(X_train, y_train)

Pipeline(steps=[('feature_processing',
                 FeatureUnion(transformer_list=[('binary_variables_processing',
                                                 FunctionTransformer(func=<function <lambda> at 0x18541eb90>)),
                                                ('numeric_variables_processing',
                                                 Pipeline(steps=[('selecting',
                                                                  FunctionTransformer(func=<function <lambda> at 0x18541ec20>)),
                                                                 ('scaling',
                                                                  StandardScaler())])),
                                                ('categorical_variables_processing',
                                                 Pipeline(steps=[('selecting',
                                                                  FunctionTransformer(func=<function <lambda> at 0x18541ecb0>)),
                      

In [34]:
metrics.roc_auc_score(y_test, estimator.predict(X_test))

0.925098275823694

In [35]:
knn = KNeighborsClassifier()

In [36]:
estimator = pipeline.Pipeline(steps = [       
    ('feature_processing', pipeline.FeatureUnion(transformer_list = [        
            #binary
            ('binary_variables_processing', preprocessing.FunctionTransformer(lambda data: data.iloc[:, binary_data_indices])), 
                    
            #numeric
            ('numeric_variables_processing', pipeline.Pipeline(steps = [
                ('selecting', preprocessing.FunctionTransformer(lambda data: data.iloc[:, numeric_data_indices])),
                ('scaling', preprocessing.StandardScaler())            
                        ])),
        
            #categorical
            ('categorical_variables_processing', pipeline.Pipeline(steps = [
                ('selecting', preprocessing.FunctionTransformer(lambda data: data.iloc[:, categorical_data_indices].astype(str))),
                ('hot_encoding', preprocessing.OneHotEncoder(handle_unknown = 'ignore', sparse=False))            
                        ])),
        ])),
    ('model_fitting', knn)
    ]
)

In [37]:
estimator.fit(X_train, y_train)

Pipeline(steps=[('feature_processing',
                 FeatureUnion(transformer_list=[('binary_variables_processing',
                                                 FunctionTransformer(func=<function <lambda> at 0x18541edd0>)),
                                                ('numeric_variables_processing',
                                                 Pipeline(steps=[('selecting',
                                                                  FunctionTransformer(func=<function <lambda> at 0x18541ef80>)),
                                                                 ('scaling',
                                                                  StandardScaler())])),
                                                ('categorical_variables_processing',
                                                 Pipeline(steps=[('selecting',
                                                                  FunctionTransformer(func=<function <lambda> at 0x185408050>)),
                      

In [38]:
%%time
metrics.roc_auc_score(y_test, estimator.predict(X_test))

CPU times: user 11min 47s, sys: 6.27 s, total: 11min 53s
Wall time: 14min 55s


0.7478490774350648

#### Наибольшую точность для данной задачи имеет логистическая регрессия. Наименьшую - метод ближайших соседей (по умолчанию k = 5). Дольше всего выполняется метод KNN.