In [4]:
import numpy as np
import pandas as pd
from sklearn import preprocessing

In [5]:
df = pd.read_csv('TIMSS2011TW.csv')
df = df[['math','gender','math interest','math input','math hours','parental education','educational resources']]
df.head()

Unnamed: 0,math,gender,math interest,math input,math hours,parental education,educational resources
0,729.39375,girl,8.93041,9.15641,45min - 3hours,high school,9.60097
1,776.19646,girl,13.46507,12.42205,<= 45min,high school,8.91919
2,718.17348,girl,9.60333,10.15325,<= 45min,elementary school,6.33067
3,607.18468,girl,13.46507,8.70884,<= 45min,junior high school,10.25396
4,658.17594,girl,8.26761,7.85736,45min - 3hours,university above,10.92551


In [6]:
print(df['gender'].unique())
print(df['math hours'].unique())
print(df['parental education'].unique())

['girl' 'boy']
['45min - 3hours' '<= 45min' '>= 3hours']
['high school' 'elementary school' 'junior high school' 'university above'
 'college']


In [7]:
# mapping or onehot non-number data
# one-hot feature 'gender'
# mapping math hours, '<= 45min' to 30 min, '45min - 3hours' to 120 min,'>= 3hours' to 300 min.
# mapping parental education, 'elementary school' to 0, 'junior high school' to 1, 'high school' to 2,  'college' to 3, 'university above' to 4.
df['math hours'] = df['math hours'].map({'<= 45min':30,'45min - 3hours':120,'>= 3hours':300})
df['parental education'] = df['parental education'].map({'elementary school':0,'junior high school':1,'high school':2,'college':3,'university above':4})
df_onehot_gender = pd.get_dummies(data=df, columns=['gender'])
df_onehot_gender.head()

Unnamed: 0,math,math interest,math input,math hours,parental education,educational resources,gender_boy,gender_girl
0,729.39375,8.93041,9.15641,120,2,9.60097,0,1
1,776.19646,13.46507,12.42205,30,2,8.91919,0,1
2,718.17348,9.60333,10.15325,30,0,6.33067,0,1
3,607.18468,13.46507,8.70884,30,1,10.25396,0,1
4,658.17594,8.26761,7.85736,120,4,10.92551,0,1


In [24]:
df_onehot_gender.isna().values.any()

False

In [8]:
len(df_onehot_gender)

4467

In [9]:
# Get array to train
df_train = df_onehot_gender[:4000]
df_test = df_onehot_gender[4000:]
print('df_train.shape',df_train.shape,', df_test.shape',df_test.shape)

df_train.shape (4000, 8) , df_test.shape (467, 8)


In [10]:
def preprocess(raw_df):
    ndarray = raw_df.values
    feature = ndarray[:,1:]
    label = ndarray[:,0]
    minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
    scaledFeatures = minmax_scale.fit_transform(feature)
    return scaledFeatures, label

In [11]:
x_train, y_train = preprocess(df_train)
x_test, y_test = preprocess(df_test)
# reshape y_train and y_test
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

In [13]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import torchvision.transforms as transforms

In [71]:
class LR(nn.Module): # 繼承torch.nn.module特性
    
    # 架構
    def __init__(self):
        super(LR, self).__init__() # super(LR, self) 繼承自己的__init__()
        self.hidden1 = nn.Linear(7, 50) # input 9, hidden 40  Linear全連接
        self.dropout = nn.Dropout(0.7)
        self.hidden2 = nn.Linear(50, 30) # input 40, hidden 40  Linear全連接
        #self.hidden3 = nn.Linear(80, 60) # input 40, hidden 40  Linear全連接
        self.hidden4 = nn.Linear(30, 10) # input 40, hidden 30  Linear全連接
        self.hidden5 = nn.Linear(10, 1) # hidden 10, output 1

    # 運作流程（資料流）
    def forward(self, x):  
        x = F.relu(self.hidden1(x)) # 讓x由hidden1後，進入activation function
        x = F.dropout(self.dropout(x))
        x = F.relu(self.hidden2(x))
        #x = F.dropout(self.dropout(x))
        #x = F.relu(self.hidden3(x))
        #x = F.dropout(self.dropout(x))
        x = F.relu(self.hidden4(x))
        out = self.hidden5(x)
        return out

In [73]:
model = LR() # network model
loss_func = nn.L1Loss() 
learning_rate = 0.03
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) #model.parameters() 所有訓練的參數都在這
# model.cuda()

In [79]:
epochs = 5

In [80]:
# convert to variables

x = Variable(torch.from_numpy(x_train).float())
y = Variable(torch.from_numpy(y_train).float())
#print(x)

for epoch in range(epochs):    
    # clear gradient w.r.t. parameters 
    optimizer.zero_grad()
    
    # forward to get output
    prediction = model(x)
    # calculate loss
    loss = loss_func(prediction, y)

    # backward to get gradient
    loss.backward()

    # update parameters
    optimizer.step() 

    if epoch % 100 == 0:
    # plot and show learning process
        print("epoch %d, loss %.8f" % (epoch, loss.data[0]))

epoch 0, loss 79.06218719




In [81]:
train_prediction = prediction.data.numpy()
df_train['prediction'] = train_prediction
df_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,math,math interest,math input,math hours,parental education,educational resources,gender_boy,gender_girl,prediction
0,729.39375,8.93041,9.15641,120,2,9.60097,0,1,622.759216
1,776.19646,13.46507,12.42205,30,2,8.91919,0,1,622.759216
2,718.17348,9.60333,10.15325,30,0,6.33067,0,1,622.759216
3,607.18468,13.46507,8.70884,30,1,10.25396,0,1,622.759216
4,658.17594,8.26761,7.85736,120,4,10.92551,0,1,622.759216
5,478.57630,6.36452,7.85736,30,3,10.92551,0,1,622.759216
6,675.60044,10.80246,9.15641,30,4,11.64917,0,1,622.759216
7,601.04247,7.91186,7.42878,120,1,9.60097,0,1,622.759216
8,628.61673,9.96500,10.15325,30,2,10.92551,0,1,622.759216
9,639.98171,10.35777,10.15325,30,2,9.60097,0,1,622.759216


在使用和助教相同的Loss function和神經層數目的情況下，他的Loss在一開始就呈現overfloat的狀態，在我們刪減一些神經層之後，終於從Nan變成有數字的結果，但是Loss卻在很高的地方就不再下降了，因此我們嘗試改用不同的Loss Function, 結果Loss的scale比原來的要小很多，最後也有收斂，本來以為可以告一段落，卻發現預測結果和實際結果差了很多，且預測結果全部都是同樣的數值，我們完全一頭霧水，只能先暫時坐到這裡。