# 幸福感预测

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

### 基础设置

In [2]:
# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None
# Show every row when a DataFrame is rendered (no row truncation).
pd.options.display.max_rows = None

### 1. 导入数据

In [3]:
# Load the labelled training set and the unlabelled test set from the local data folder.
train_csv_path = "./data/happiness_train_abbr.csv"
test_csv_path = "./data/happiness_test_abbr.csv"
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

### 2. 查看数据

In [4]:
# Inspect the shape (rows, columns) of the training frame.
train.shape

(8000, 42)

In [5]:
# Inspect the shape (rows, columns) of the test frame.
test.shape

(2968, 41)

In [6]:
# Take a quick look at the first few rows of the training data.
train.head()

Unnamed: 0,id,happiness,survey_type,province,city,county,survey_time,gender,birth,nationality,religion,religion_freq,edu,income,political,floor_area,height_cm,weight_jin,health,health_problem,depression,hukou,socialize,relax,learn,equity,class,work_exper,work_status,work_yr,work_type,work_manage,family_income,family_m,family_status,house,car,marital,status_peer,status_3_before,view,inc_ability
0,1,4,1,12,32,59,2015/8/4 14:18,1,1959,1,1,1,11,20000,1,45.0,176,155,3,2,5,5,2,4,3,3,3,1,3.0,30.0,1.0,2.0,60000.0,2,2,1,2,3,3,2,4,3
1,2,4,2,18,52,85,2015/7/21 15:04,1,1992,1,1,1,12,20000,1,110.0,170,110,5,4,3,1,2,4,3,3,6,1,3.0,2.0,1.0,3.0,40000.0,3,4,1,2,1,1,1,4,2
2,3,4,2,29,83,126,2015/7/21 13:24,2,1967,1,0,3,4,2000,1,120.0,160,122,4,4,5,1,3,4,2,4,5,2,,,,,8000.0,3,3,1,2,3,2,1,4,2
3,4,5,2,10,28,51,2015/7/25 17:33,2,1943,1,1,1,3,6420,1,78.0,163,170,4,4,4,1,2,4,4,4,5,4,,,,,12000.0,3,3,1,1,7,2,1,3,2
4,5,4,1,7,18,36,2015/8/10 9:50,2,1994,1,1,1,12,-1,2,70.0,165,110,5,5,3,2,4,3,4,2,1,6,,,,,-2.0,4,3,1,1,1,3,2,3,-8


In [7]:
# Check how many non-null values each column has (i.e. how much data is missing).
# `show_counts` replaces the `null_counts` keyword, which was deprecated in
# pandas 1.2 and removed in pandas 2.0.
train.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 42 columns):
id                 8000 non-null int64
happiness          8000 non-null int64
survey_type        8000 non-null int64
province           8000 non-null int64
city               8000 non-null int64
county             8000 non-null int64
survey_time        8000 non-null object
gender             8000 non-null int64
birth              8000 non-null int64
nationality        8000 non-null int64
religion           8000 non-null int64
religion_freq      8000 non-null int64
edu                8000 non-null int64
income             8000 non-null int64
political          8000 non-null int64
floor_area         8000 non-null float64
height_cm          8000 non-null int64
weight_jin         8000 non-null int64
health             8000 non-null int64
health_problem     8000 non-null int64
depression         8000 non-null int64
hukou              8000 non-null int64
socialize          8000 non-null

In [8]:
# Drop the four columns that have a large share of missing values.
sparse_columns = ["work_status", "work_yr", "work_type", "work_manage"]
train = train.drop(columns=sparse_columns)

### 3. 分割数据

In [9]:
# Hold out 25% of the labelled rows for validation.
# Features are every column except the row identifier (`id`) and the target
# (`happiness`); selecting them by name keeps the split correct even if the
# column order changes.
# A fixed random_state makes the split, and every result derived from it,
# reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    train.drop(columns=["id", "happiness"]),
    train["happiness"],
    test_size=0.25,
    random_state=42,
)

### 4. 特征工程

In [10]:
# Inspect the distribution of the training labels.
y_train.value_counts()

 4    3612
 5    1078
 3     858
 2     372
 1      73
-8       7
Name: happiness, dtype: int64

In [11]:
# -8 encodes "refused to answer"; recode it as 3 ("neither happy nor unhappy").
# NOTE(review): only y_train is recoded here — confirm y_test is handled elsewhere.
y_train = y_train.replace(-8, 3)

In [12]:
# Re-check the training label distribution (the -8 class should no longer appear).
y_train.value_counts()

4    3612
5    1078
3     865
2     372
1      73
Name: happiness, dtype: int64

In [13]:
# Summarise the training features: dtypes and non-null counts per column.
# `show_counts` replaces the `null_counts` keyword, which was deprecated in
# pandas 1.2 and removed in pandas 2.0.
x_train.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6000 entries, 7757 to 544
Data columns (total 36 columns):
survey_type        6000 non-null int64
province           6000 non-null int64
city               6000 non-null int64
county             6000 non-null int64
survey_time        6000 non-null object
gender             6000 non-null int64
birth              6000 non-null int64
nationality        6000 non-null int64
religion           6000 non-null int64
religion_freq      6000 non-null int64
edu                6000 non-null int64
income             6000 non-null int64
political          6000 non-null int64
floor_area         6000 non-null float64
height_cm          6000 non-null int64
weight_jin         6000 non-null int64
health             6000 non-null int64
health_problem     6000 non-null int64
depression         6000 non-null int64
hukou              6000 non-null int64
socialize          6000 non-null int64
relax              6000 non-null int64
learn              6000 non-nu

In [14]:
# Preview only the first rows: display.max_rows is set to None in this notebook,
# so a bare `x_train` would render every row of the frame.
x_train.head(10)

Unnamed: 0,survey_type,province,city,county,survey_time,gender,birth,nationality,religion,religion_freq,edu,income,political,floor_area,height_cm,weight_jin,health,health_problem,depression,hukou,socialize,relax,learn,equity,class,work_exper,family_income,family_m,family_status,house,car,marital,status_peer,status_3_before,view,inc_ability
7757,1,28,81,123,2015/7/29 15:19,2,1969,1,1,1,6,12000,1,200.0,160,110,5,4,4,4,2,2,2,2,5,5,48000.0,5,2,1,2,3,3,2,5,4
2324,2,18,55,89,2015/7/24 9:55,2,1948,1,0,2,1,0,1,220.0,157,110,2,2,2,1,4,4,1,4,5,4,-2.0,5,3,1,2,3,2,1,-8,-8
355,2,10,27,50,2015/7/29 9:05,1,1952,1,1,1,6,0,1,130.0,170,140,4,4,4,1,2,3,1,4,5,5,50000.0,4,3,1,2,3,2,2,4,2
968,1,5,8,24,2015/8/4 15:28,1,1957,1,1,1,4,20000,1,60.0,175,140,4,5,5,1,4,3,1,2,2,1,20000.0,2,2,1,2,3,3,2,3,3
5760,1,6,17,33,2015/8/5 17:05,2,1949,1,1,1,3,24000,1,70.0,161,102,2,2,3,5,3,3,1,2,4,5,24000.0,1,3,1,2,7,2,1,4,3
6587,2,18,55,89,2015/7/24 9:44,2,1953,1,1,1,1,1720,1,180.0,160,110,4,4,5,1,4,3,1,4,5,3,50000.0,1,3,1,2,3,2,1,3,2
2451,1,6,15,31,2015/8/11 14:57,1,1967,1,1,1,4,30000,1,40.0,165,105,4,4,4,1,3,2,1,2,4,3,50000.0,5,2,1,2,3,2,2,4,3
2576,1,28,81,123,2015/7/29 17:32,2,1952,1,1,1,3,11000,-8,57.0,163,110,3,4,5,4,3,4,2,3,5,3,20000.0,2,3,1,2,3,2,1,4,2
7221,2,21,62,98,2015/7/16 11:08,1,1971,1,1,1,3,40000,1,220.0,170,138,5,5,5,1,4,4,1,2,5,3,-2.0,9,3,1,2,3,2,2,3,2
4194,1,16,45,76,2015/7/25 8:25,2,1949,1,1,1,1,4800,1,100.0,150,126,2,2,3,2,1,4,1,4,5,4,12000.0,2,3,1,2,3,3,2,3,2
