In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
weather = pd.read_csv("weatherAUS5000.csv")

In [3]:
weather.head()

Unnamed: 0.1,Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainTomorrow
0,0,2015-03-24,Adelaide,12.3,19.3,0.0,5.0,,S,39.0,...,19.0,59.0,47.0,1022.2,1021.4,,,15.1,17.7,No
1,1,2011-07-12,Adelaide,7.9,11.4,0.0,1.0,0.5,N,20.0,...,7.0,70.0,59.0,1028.7,1025.7,,,8.4,11.3,No
2,2,2010-02-08,Adelaide,24.0,38.1,0.0,23.4,13.0,SE,39.0,...,19.0,36.0,24.0,1018.0,1016.0,,,32.4,37.4,No
3,3,2016-09-19,Adelaide,6.7,16.4,0.4,,,N,31.0,...,15.0,65.0,40.0,1014.4,1010.0,,,11.2,15.9,No
4,4,2014-03-05,Adelaide,16.7,24.8,0.0,6.6,11.7,S,37.0,...,24.0,61.0,48.0,1019.3,1018.9,,,20.8,23.7,No


In [4]:
# 分开特征矩阵和标签
X = weather.iloc[:, :-1]
Y = weather.iloc[:,-1]

In [5]:
X.shape

(5000, 22)

In [6]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     5000 non-null   int64  
 1   Date           5000 non-null   object 
 2   Location       5000 non-null   object 
 3   MinTemp        4979 non-null   float64
 4   MaxTemp        4987 non-null   float64
 5   Rainfall       4950 non-null   float64
 6   Evaporation    2841 non-null   float64
 7   Sunshine       2571 non-null   float64
 8   WindGustDir    4669 non-null   object 
 9   WindGustSpeed  4669 non-null   float64
 10  WindDir9am     4651 non-null   object 
 11  WindDir3pm     4887 non-null   object 
 12  WindSpeed9am   4949 non-null   float64
 13  WindSpeed3pm   4919 non-null   float64
 14  Humidity9am    4936 non-null   float64
 15  Humidity3pm    4880 non-null   float64
 16  Pressure9am    4506 non-null   float64
 17  Pressure3pm    4504 non-null   float64
 18  Cloud9am

In [7]:
# 探索缺失值
X.isnull().mean()

Unnamed: 0       0.0000
Date             0.0000
Location         0.0000
MinTemp          0.0042
MaxTemp          0.0026
Rainfall         0.0100
Evaporation      0.4318
Sunshine         0.4858
WindGustDir      0.0662
WindGustSpeed    0.0662
WindDir9am       0.0698
WindDir3pm       0.0226
WindSpeed9am     0.0102
WindSpeed3pm     0.0162
Humidity9am      0.0128
Humidity3pm      0.0240
Pressure9am      0.0988
Pressure3pm      0.0992
Cloud9am         0.3778
Cloud3pm         0.3976
Temp9am          0.0066
Temp3pm          0.0176
dtype: float64

In [8]:
# 分为训练集和测试集
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3, random_state=420)

In [9]:
# 恢复索引
for i in [Xtrain, Xtest, Ytrain, Ytest]:
    i.index = range(i.shape[0])

In [10]:
Ytrain

0        No
1        No
2        No
3       Yes
4        No
       ... 
3495     No
3496    Yes
3497     No
3498     No
3499     No
Name: RainTomorrow, Length: 3500, dtype: object

In [11]:
# 是否不均衡
Ytrain.value_counts()

No     2704
Yes     796
Name: RainTomorrow, dtype: int64

In [12]:
Ytest.value_counts()

No     1157
Yes     343
Name: RainTomorrow, dtype: int64

In [13]:
# 将标签编码
from sklearn.preprocessing import LabelEncoder
encorder = LabelEncoder().fit(Ytrain)

In [14]:
Ytrain = pd.DataFrame(encorder.transform(Ytrain))
Ytest = pd.DataFrame(encorder.transform(Ytest))

In [15]:
Ytest

Unnamed: 0,0
0,0
1,0
2,1
3,0
4,0
...,...
1495,0
1496,0
1497,0
1498,1


In [16]:
Xtrain.head()

Unnamed: 0.1,Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,...,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm
0,1809,2015-08-24,Katherine,17.5,36.0,0.0,8.8,,ESE,26.0,...,17.0,15.0,57.0,,1016.8,1012.2,0.0,,27.5,
1,4176,2016-12-10,Tuggeranong,9.5,25.0,0.0,,,NNW,33.0,...,7.0,17.0,59.0,31.0,1020.4,1017.5,,,14.6,23.6
2,110,2010-04-18,Albany,13.0,22.6,0.0,3.8,10.4,,,...,17.0,31.0,79.0,68.0,1020.3,1015.7,1.0,3.0,17.5,20.8
3,3582,2009-11-26,Sale,13.9,29.8,0.0,5.8,5.1,S,37.0,...,11.0,28.0,82.0,44.0,1012.5,1005.9,6.0,6.0,18.5,27.5
4,2162,2014-04-25,Mildura,6.0,23.5,0.0,2.8,8.6,NNE,24.0,...,15.0,15.0,58.0,35.0,1019.8,1014.1,2.0,4.0,12.4,22.4


In [17]:
Xtrain = Xtrain.iloc[:, 1:]

In [18]:
Xtrain.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm
0,2015-08-24,Katherine,17.5,36.0,0.0,8.8,,ESE,26.0,NNW,...,17.0,15.0,57.0,,1016.8,1012.2,0.0,,27.5,
1,2016-12-10,Tuggeranong,9.5,25.0,0.0,,,NNW,33.0,NE,...,7.0,17.0,59.0,31.0,1020.4,1017.5,,,14.6,23.6
2,2010-04-18,Albany,13.0,22.6,0.0,3.8,10.4,,,NE,...,17.0,31.0,79.0,68.0,1020.3,1015.7,1.0,3.0,17.5,20.8
3,2009-11-26,Sale,13.9,29.8,0.0,5.8,5.1,S,37.0,N,...,11.0,28.0,82.0,44.0,1012.5,1005.9,6.0,6.0,18.5,27.5
4,2014-04-25,Mildura,6.0,23.5,0.0,2.8,8.6,NNE,24.0,E,...,15.0,15.0,58.0,35.0,1019.8,1014.1,2.0,4.0,12.4,22.4


In [19]:
Xtest = Xtest.iloc[:, 1:]

In [20]:
Xtest

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm
0,2016-01-23,NorahHead,22.0,27.8,25.2,,,SSW,57.0,S,...,2.0,37.0,91.0,86.0,1006.6,1008.1,,,26.2,23.1
1,2009-03-05,MountGambier,12.0,18.6,2.2,3.0,7.8,SW,52.0,SW,...,28.0,28.0,88.0,62.0,1020.2,1019.9,8.0,7.0,14.8,17.5
2,2010-03-05,MountGinini,9.1,13.3,,,,NE,41.0,,...,,,,,,,,,,
3,2013-10-26,Wollongong,13.1,20.3,0.0,,,SW,33.0,W,...,15.0,24.0,40.0,51.0,1021.3,1019.5,,,16.8,19.6
4,2016-11-28,Sale,12.2,20.0,0.4,,,E,33.0,SW,...,7.0,19.0,92.0,69.0,1015.6,1013.2,8.0,4.0,13.6,19.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,2015-05-29,Albury,9.1,15.8,11.2,,,WNW,30.0,,...,0.0,13.0,100.0,64.0,1022.6,1021.8,1.0,7.0,11.0,14.0
1496,2015-05-24,Uluru,9.6,21.1,0.0,,,E,26.0,ESE,...,15.0,4.0,36.0,23.0,1023.2,1020.9,8.0,8.0,14.2,20.3
1497,2015-11-20,SalmonGums,5.5,25.9,0.0,,,S,43.0,SE,...,17.0,17.0,44.0,25.0,,,,,18.4,24.9
1498,2012-08-16,WaggaWagga,2.6,15.4,0.0,1.6,5.2,NW,41.0,ENE,...,11.0,24.0,79.0,54.0,1015.9,1011.5,6.0,5.0,8.1,13.3


In [21]:
12+4+7+4+3+4+12+4+14+2

66

### 处理困难特征：日期

In [22]:
Xtrain.iloc[:,0].value_counts().count()

2141

In [23]:
Xtrain["Rainfall"].head(20)

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
5      0.0
6      0.0
7      0.2
8      0.0
9      0.2
10     1.0
11     0.0
12     0.2
13     0.0
14     0.0
15     3.0
16     0.2
17     0.0
18    35.2
19     0.0
Name: Rainfall, dtype: float64

In [24]:
# 创建新特征：今日是否下雨
# 下雨量超1是yes 不到1是no 空值依然作为空值
Xtrain.loc[Xtrain["Rainfall"] >= 1, "RainToday"] = "Yes"
Xtrain.loc[Xtrain["Rainfall"] < 1, "RainToday"] = "No"
Xtrain.loc[Xtrain["Rainfall"] == np.nan, "RainToday"] = np.nan

In [25]:
Xtrain.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,2015-08-24,Katherine,17.5,36.0,0.0,8.8,,ESE,26.0,NNW,...,15.0,57.0,,1016.8,1012.2,0.0,,27.5,,No
1,2016-12-10,Tuggeranong,9.5,25.0,0.0,,,NNW,33.0,NE,...,17.0,59.0,31.0,1020.4,1017.5,,,14.6,23.6,No
2,2010-04-18,Albany,13.0,22.6,0.0,3.8,10.4,,,NE,...,31.0,79.0,68.0,1020.3,1015.7,1.0,3.0,17.5,20.8,No
3,2009-11-26,Sale,13.9,29.8,0.0,5.8,5.1,S,37.0,N,...,28.0,82.0,44.0,1012.5,1005.9,6.0,6.0,18.5,27.5,No
4,2014-04-25,Mildura,6.0,23.5,0.0,2.8,8.6,NNE,24.0,E,...,15.0,58.0,35.0,1019.8,1014.1,2.0,4.0,12.4,22.4,No


In [26]:
Xtest.loc[Xtest["Rainfall"] >= 1, "RainToday"] = "Yes"
Xtest.loc[Xtest["Rainfall"] < 1, "RainToday"] = "No"
Xtest.loc[Xtest["Rainfall"] == np.nan, "RainToday"] = np.nan

In [27]:
Xtest.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,2016-01-23,NorahHead,22.0,27.8,25.2,,,SSW,57.0,S,...,37.0,91.0,86.0,1006.6,1008.1,,,26.2,23.1,Yes
1,2009-03-05,MountGambier,12.0,18.6,2.2,3.0,7.8,SW,52.0,SW,...,28.0,88.0,62.0,1020.2,1019.9,8.0,7.0,14.8,17.5,Yes
2,2010-03-05,MountGinini,9.1,13.3,,,,NE,41.0,,...,,,,,,,,,,
3,2013-10-26,Wollongong,13.1,20.3,0.0,,,SW,33.0,W,...,24.0,40.0,51.0,1021.3,1019.5,,,16.8,19.6,No
4,2016-11-28,Sale,12.2,20.0,0.4,,,E,33.0,SW,...,19.0,92.0,69.0,1015.6,1013.2,8.0,4.0,13.6,19.0,No


In [28]:
int(Xtrain.loc[0, "Date"].split('-')[1])

8

In [29]:
# 日期只留下月份
Xtrain["Date"] = Xtrain["Date"].apply(lambda x: int(x.split('-')[1]))

In [30]:
Xtrain.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,8,Katherine,17.5,36.0,0.0,8.8,,ESE,26.0,NNW,...,15.0,57.0,,1016.8,1012.2,0.0,,27.5,,No
1,12,Tuggeranong,9.5,25.0,0.0,,,NNW,33.0,NE,...,17.0,59.0,31.0,1020.4,1017.5,,,14.6,23.6,No
2,4,Albany,13.0,22.6,0.0,3.8,10.4,,,NE,...,31.0,79.0,68.0,1020.3,1015.7,1.0,3.0,17.5,20.8,No
3,11,Sale,13.9,29.8,0.0,5.8,5.1,S,37.0,N,...,28.0,82.0,44.0,1012.5,1005.9,6.0,6.0,18.5,27.5,No
4,4,Mildura,6.0,23.5,0.0,2.8,8.6,NNE,24.0,E,...,15.0,58.0,35.0,1019.8,1014.1,2.0,4.0,12.4,22.4,No


In [31]:
# 对列重命名
Xtrain = Xtrain.rename(columns={"Date":"Month"})

In [32]:
Xtrain.head()

Unnamed: 0,Month,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,8,Katherine,17.5,36.0,0.0,8.8,,ESE,26.0,NNW,...,15.0,57.0,,1016.8,1012.2,0.0,,27.5,,No
1,12,Tuggeranong,9.5,25.0,0.0,,,NNW,33.0,NE,...,17.0,59.0,31.0,1020.4,1017.5,,,14.6,23.6,No
2,4,Albany,13.0,22.6,0.0,3.8,10.4,,,NE,...,31.0,79.0,68.0,1020.3,1015.7,1.0,3.0,17.5,20.8,No
3,11,Sale,13.9,29.8,0.0,5.8,5.1,S,37.0,N,...,28.0,82.0,44.0,1012.5,1005.9,6.0,6.0,18.5,27.5,No
4,4,Mildura,6.0,23.5,0.0,2.8,8.6,NNE,24.0,E,...,15.0,58.0,35.0,1019.8,1014.1,2.0,4.0,12.4,22.4,No


In [33]:
Xtest["Date"] = Xtest["Date"].apply(lambda x:int(x.split('-')[1]))

In [34]:
Xtest = Xtest.rename(columns={"Date":"Month"})

### 处理困难特征：地点

In [138]:
cityll = pd.read_csv("cityll.csv", index_col=0)

In [139]:
cityll.head()

Unnamed: 0,City,Latitude,Longitude,Latitudedir,Longitudedir
0,Adelaide,34.9285°,138.6007°,"S,",E
1,Albany,35.0275°,117.8840°,"S,",E
2,Albury,36.0737°,146.9135°,"S,",E
3,Wodonga,36.1241°,146.8818°,"S,",E
4,AliceSprings,23.6980°,133.8807°,"S,",E


In [142]:
city_climate = pd.read_csv("Cityclimate.csv")

In [143]:
city_climate.head()

Unnamed: 0,City,Climate
0,Adelaide,Warm temperate
1,Albany,Mild temperate
2,Albury,"Hot dry summer, cool winter"
3,Wodonga,"Hot dry summer, cool winter"
4,AliceSprings,"Hot dry summer, warm winter"


In [147]:
float(cityll.loc[0,"Latitude"][:-1])

34.9285

In [148]:
cityll.loc[:, "Latitudedir"].value_counts()

S,    100
Name: Latitudedir, dtype: int64

In [149]:
cityll["Latituenum"] = cityll["Latitude"].apply(lambda x:float(x[:-1]))
cityll["Longitudenum"] = cityll["Longitude"].apply(lambda x:float(x[:-1]))

In [150]:
cityll.head()

Unnamed: 0,City,Latitude,Longitude,Latitudedir,Longitudedir,Latituenum,Longitudenum
0,Adelaide,34.9285°,138.6007°,"S,",E,34.9285,138.6007
1,Albany,35.0275°,117.8840°,"S,",E,35.0275,117.884
2,Albury,36.0737°,146.9135°,"S,",E,36.0737,146.9135
3,Wodonga,36.1241°,146.8818°,"S,",E,36.1241,146.8818
4,AliceSprings,23.6980°,133.8807°,"S,",E,23.698,133.8807


In [151]:
citylld = cityll.iloc[:,[0,5,6]]

In [152]:
citylld.head()

Unnamed: 0,City,Latituenum,Longitudenum
0,Adelaide,34.9285,138.6007
1,Albany,35.0275,117.884
2,Albury,36.0737,146.9135
3,Wodonga,36.1241,146.8818
4,AliceSprings,23.698,133.8807


In [159]:
#将city_climate中的气候添加到我们的citylld中
citylld["climate"] = city_climate.iloc[:,-1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [154]:
samplecity = pd.read_csv("samplecity.csv",index_col=0)

In [156]:
#我们对samplecity也执行同样的处理：去掉经纬度中度数的符号，并且舍弃我们的经纬度的方向
samplecity["Latitudenum"] = samplecity["Latitude"].apply(lambda x:float(x[:-1]))
samplecity["Longitudenum"] = samplecity["Longitude"].apply(lambda x:float(x[:-1]))
samplecityd = samplecity.iloc[:,[0,5,6]]

In [157]:
#首先使用radians将角度转换成弧度
from math import radians, sin, cos, acos
citylld.loc[:,"slat"] = citylld.iloc[:,1].apply(lambda x : radians(x))
citylld.loc[:,"slon"] = citylld.iloc[:,2].apply(lambda x : radians(x))
samplecityd.loc[:,"elat"] = samplecityd.iloc[:,1].apply(lambda x : radians(x))
samplecityd.loc[:,"elon"] = samplecityd.iloc[:,2].apply(lambda x : radians(x))

In [160]:
import sys
for i in range(samplecityd.shape[0]):
    slat = citylld.loc[:,"slat"]
    slon = citylld.loc[:,"slon"]
    elat = samplecityd.loc[i,"elat"]
    elon = samplecityd.loc[i,"elon"]
    dist = 6371.01 * np.arccos(np.sin(slat)*np.sin(elat) + 
                          np.cos(slat)*np.cos(elat)*np.cos(slon.values - elon))
    city_index = np.argsort(dist)[0]
    #每次计算后，取距离最近的城市，然后将最近的城市和城市对应的气候都匹配到samplecityd中
    samplecityd.loc[i,"closest_city"] = citylld.loc[city_index,"City"]
    samplecityd.loc[i,"climate"] = citylld.loc[city_index,"climate"]

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [161]:
#查看气候的分布
samplecityd["climate"].value_counts()

Warm temperate                       15
Mild temperate                       10
Cool temperate                        9
Hot dry summer, cool winter           6
High humidity summer, warm winter     4
Hot dry summer, warm winter           3
Warm humid summer, mild winter        2
Name: climate, dtype: int64

In [162]:
#确认无误后，取出样本城市所对应的气候，并保存
locafinal = samplecityd.iloc[:,[0,-1]]

In [164]:
locafinal.head()

Unnamed: 0,City,climate
0,Canberra,Cool temperate
1,Sydney,Warm temperate
2,Perth,Warm temperate
3,Darwin,"High humidity summer, warm winter"
4,Hobart,Cool temperate


In [165]:
locafinal.columns = ["Location","Climate"]

In [166]:
#在这里设定locafinal的索引为地点，是为了之后进行map的匹配
locafinal = locafinal.set_index(keys="Location")

In [167]:
locafinal.to_csv("samplelocation.csv")

In [35]:
locafinal = pd.read_csv("samplelocation.csv")

In [36]:
locafinal.head()

Unnamed: 0,Location,Climate
0,Canberra,Cool temperate
1,Sydney,Warm temperate
2,Perth,Warm temperate
3,Darwin,"High humidity summer, warm winter"
4,Hobart,Cool temperate


In [37]:
locafinal.iloc[:,0]

0             Canberra
1               Sydney
2                Perth
3               Darwin
4               Hobart
5             Brisbane
6             Adelaide
7              Bendigo
8           Townsville
9         AliceSprings
10        MountGambier
11          Launceston
12            Ballarat
13              Albany
14              Albury
15        PerthAirport
16    MelbourneAirport
17             Mildura
18       SydneyAirport
19           Nuriootpa
20                Sale
21            Watsonia
22         Tuggeranong
23            Portland
24             Woomera
25              Cairns
26               Cobar
27          Wollongong
28           GoldCoast
29          WaggaWagga
30       NorfolkIsland
31             Penrith
32          SalmonGums
33           Newcastle
34        CoffsHarbour
35         Witchcliffe
36            Richmond
37            Dartmoor
38           NorahHead
39       BadgerysCreek
40         MountGinini
41               Moree
42             Walpole
43         

In [38]:
Xtrain.head()

Unnamed: 0,Month,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,8,Katherine,17.5,36.0,0.0,8.8,,ESE,26.0,NNW,...,15.0,57.0,,1016.8,1012.2,0.0,,27.5,,No
1,12,Tuggeranong,9.5,25.0,0.0,,,NNW,33.0,NE,...,17.0,59.0,31.0,1020.4,1017.5,,,14.6,23.6,No
2,4,Albany,13.0,22.6,0.0,3.8,10.4,,,NE,...,31.0,79.0,68.0,1020.3,1015.7,1.0,3.0,17.5,20.8,No
3,11,Sale,13.9,29.8,0.0,5.8,5.1,S,37.0,N,...,28.0,82.0,44.0,1012.5,1005.9,6.0,6.0,18.5,27.5,No
4,4,Mildura,6.0,23.5,0.0,2.8,8.6,NNE,24.0,E,...,15.0,58.0,35.0,1019.8,1014.1,2.0,4.0,12.4,22.4,No


In [39]:
import re
Xtrain["Location"] = Xtrain["Location"].map(locafinal.iloc[:,0]).apply(lambda x: re.sub(",","",x.strip()))

AttributeError: 'float' object has no attribute 'strip'

In [40]:
Xtrainc = Xtrain.copy()
Xtrainc["Location"] = Xtrainc["Location"].map(locafinal.iloc[:,0])

In [44]:
Xtrainc["Location"].isnull().sum()

3500

In [45]:
Xtrainc.sum()

Month              22493.0
Location               0.0
MinTemp            42618.6
MaxTemp            81103.7
Rainfall            8622.6
Evaporation        11142.8
Sunshine           13440.5
WindGustSpeed     130058.0
WindSpeed9am       48684.0
WindSpeed3pm       63768.0
Humidity9am       238910.0
Humidity3pm       176030.0
Pressure9am      3209580.0
Pressure3pm      3202026.2
Cloud9am            9752.0
Cloud3pm            9645.0
Temp9am            59141.7
Temp3pm            74517.9
dtype: float64