# 1、Import libraries(导入库)

原数据 (Source data):
[https://www.kaggle.com/kumarajarshi/life-expectancy-who/data](https://www.kaggle.com/kumarajarshi/life-expectancy-who/data)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the life-expectancy CSV into a DataFrame.
# NOTE(review): the relative path presumably follows the Kaggle kernel input
# layout — adjust it when running the notebook elsewhere.
dataset = pd.read_csv('../input/life-expectancy-who/Life Expectancy Data.csv')
# Preview the first rows.
dataset.head()

In [None]:
# Print the column names, non-null counts and dtypes of the frame.
dataset.info()

In [None]:
# Summary statistics of the numeric columns, rounded to 3 decimals.
dataset.describe().round(3)

In [None]:
# Distinct values found in the 'Year' column.
dataset['Year'].unique()

# 2、Data preprocessing(数据预处理)

### 2.1 Check missing values(检查缺失值)

In [None]:
# Visualise where values are missing: one cell per (row, column) of the frame.
sns.heatmap(dataset.isna())

In [None]:
# Keep only the rows that contain at least 17 non-null values
# (`thresh` counts non-null cells, not missing ones).
# NOTE(review): the original note said "delete data with more than 3 null
# values"; that matches thresh=17 only for a 20-column frame — confirm the
# column count against the dataset.info() output.
dataset = dataset.dropna(thresh=17)
# Remaining missing-value count per column, in ascending order.
dataset.isna().sum().sort_values()

In [None]:
# Pairwise correlation between the numeric columns only.
# Text columns such as 'Country' and 'Status' cannot be correlated, and
# pandas >= 2.0 raises on them instead of silently skipping them, so they
# are filtered out explicitly before calling corr().
corr_data = dataset.select_dtypes(include='number').corr()
corr_data

In [None]:
# Draw the correlation matrix on a large figure, writing each coefficient
# into its cell (annot=True).
plt.figure(figsize=(15,12))
sns.heatmap(corr_data,annot=True)

In [None]:
def imputer_mean(feature, data=None):
    """Fill the missing values of one column with that column's mean.

    The frame is modified in place; nothing is returned.

    Parameters
    ----------
    feature
        Label of the column to impute. The column must be numeric so that
        its mean can be computed.
    data : pandas.DataFrame, optional
        Frame to impute. When omitted, the notebook-level ``dataset`` is
        used; it is looked up at call time so a re-bound ``dataset`` is
        picked up instead of the object that existed when this function
        was defined.
    """
    if data is None:
        data = dataset
    # Assign the filled column back instead of calling
    # ``data[feature].fillna(..., inplace=True)``: that chained in-place call
    # operates on a temporary under pandas Copy-on-Write and would leave
    # ``data`` unchanged.
    data[feature] = data[feature].fillna(data[feature].mean())
    
# Columns that still contain at least one missing value.
features_miss= dataset.columns[dataset.isna().any()]
# Mean-impute each of them; no frame is passed, so the helper works on its
# default frame (the notebook-level `dataset`).
for feature in features_miss:
    imputer_mean(feature=feature)

In [None]:
# Missing-value count per column after imputation.
dataset.isna().sum()

### 2.2 Exploratory analysis(探索性分析)

In [None]:
# Number of rows per value of the 'Status' column.
sns.countplot(x='Status',data=dataset)

In [None]:
# Average life expectancy per development status, printed as "<status>:<mean>".
df_status = dataset.groupby('Status')
for status_name, status_rows in df_status:
    mean_life = status_rows['Life expectancy '].mean()
    print(f"{status_name}:{mean_life!s}")

In [None]:
# Rank countries by the mean of their 'Life expectancy ' rows and print three
# tables (developed top 10, overall top 10, overall bottom 10), each followed
# by a separator line.
df_sta_ed = dataset[dataset['Status'] == 'Developed'].groupby('Country')
df_coun = dataset.groupby('Country')

developed_ranking = df_sta_ed['Life expectancy '].mean().sort_values(ascending=False)
overall_ranking = df_coun['Life expectancy '].mean().sort_values(ascending=False)

for title, ranking in (
    ('Top 10 developed countries with the longest life expectancy', developed_ranking.head(10)),
    ('Top 10 countries with the longest life expectancy', overall_ranking.head(10)),
    ('Top 10 countries with the shortest life expectancy', overall_ranking.tail(10)),
):
    print(title)
    print(ranking)
    print('*'*50)

In [None]:
def _ranking_frame(mean_by_country):
    """Turn a Country-indexed Series of means into a two-column plotting frame."""
    return pd.DataFrame({
        'Country': mean_by_country.index,
        'Life expectancy ': mean_by_country.values,
    })


# Mean life expectancy per country, highest first.
developed_means = df_sta_ed['Life expectancy '].mean().sort_values(ascending=False)
country_means = df_coun['Life expectancy '].mean().sort_values(ascending=False)

df_ed = _ranking_frame(developed_means.head(10))
df_long = _ranking_frame(country_means.head(10))
df_short = _ranking_frame(country_means.tail(10))

# One horizontal bar chart per ranking.
for df in (df_ed, df_long, df_short):
    sns.barplot(x='Life expectancy ', y='Country', data=df)
    plt.show()

In [None]:
# Separate the feature matrix (everything except the target column) from the
# regression target.
X = dataset.drop(columns='Life expectancy ')
y = dataset['Life expectancy ']
X.head()

In [None]:
# Scatter every feature against life expectancy, coloured by 'Status'.
# 'Status' itself is skipped because it is already used as the hue.
for features in X.columns:
    if features == 'Status':
        continue
    sns.scatterplot(x=X[features], y=y, hue=X['Status'])
    plt.legend()
    plt.show()

# 3、Label Encoding And Dummy Variable(标签编码和虚拟变量)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the 'Country' and 'Year' values with integer codes.
# The single encoder is refitted for each column in turn, so afterwards `le`
# only holds the classes of 'Year'.
le = LabelEncoder()
for column in ('Country', 'Year'):
    X[column] = le.fit_transform(X[column])

In [None]:
# One-hot encode 'Status' and swap the original column for the indicator
# columns, which are appended on the right.
status_dum = pd.get_dummies(X['Status'])
X = pd.concat([X.drop(columns='Status'), status_dum], axis=1)
X.head(3)

# 4、Training and Testing the Model(训练测试模型)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression,Ridge
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=101)

In [None]:
# Fit a random forest and a linear-regression baseline on the training split,
# then print the value returned by each model's score() on the held-out rows.
# The forest is seeded with the same value as the train/test split: without a
# random_state its bootstrap samples differ on every run, so the printed
# score would not be reproducible.
model_Forest = RandomForestRegressor(
    max_depth=10,
    n_estimators=500,
    min_samples_split=2,
    random_state=101,
)
model_Linear = LinearRegression()
model_Forest.fit(X_train, y_train)
model_Linear.fit(X_train, y_train)
print(f'RandomForestRegressor_scores is {model_Forest.score(X_test,y_test)}')
print(f'LinearRegression_scores is {model_Linear.score(X_test,y_test)}')

In [None]:
# Predictions of both models on the test split.
y_pred1 = np.array(model_Forest.predict(X_test))
y_pred2 = np.array(model_Linear.predict(X_test))

# Plain-array copy of the true values (the RMSE cell reads it as well).
y_test = np.array(y_test)

# Print one row per test sample: [random forest, linear regression, actual].
as_columns = [values.reshape(-1, 1) for values in (y_pred1, y_pred2, y_test)]
print(np.concatenate(as_columns, axis=1))

In [None]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error on the test split: random forest first, then
# linear regression.
for predictions in (y_pred1, y_pred2):
    print(mean_squared_error(y_test, predictions) ** 0.5)