# 因子分析

## 分析の手順
1. 使用するデータセットを確認する
2. 因子負荷量を求める
3. 共通因子の意味を解釈する
4. 分析結果の精度を確認する
5. 因子得点を求める

## 参考にする資料
- マンガでわかる統計学「因子分析編」 高橋信 オーム社
- https://istat.co.jp/ta_commentary/factor_analysis

---

## １．使用するデータセットの確認
取得元：http://www.statistics.co.jp/reference/statistical_data/statistical_data.htm

### 分析の目的
**各科目の得点から、共通因子を探る**

---

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import FactorAnalysis as FA
%matplotlib inline

In [2]:
# Load the exam-score data set from the CSV file in the working directory
# and preview the first five rows.
df = pd.read_csv('seiseki.csv')
df.head()

Unnamed: 0,kokugo,shakai,sugaku,rika,ongaku,bijutu,taiiku,gika,eigo
0,30,43,51,63,60,66,37,44,20
1,39,21,49,56,70,72,56,63,16
2,29,30,23,57,69,76,33,54,6
3,95,87,77,100,77,82,78,96,87
4,70,71,78,67,72,82,46,63,44


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
df.describe()

Unnamed: 0,kokugo,shakai,sugaku,rika,ongaku,bijutu,taiiku,gika,eigo
count,166.0,166.0,166.0,166.0,166.0,166.0,166.0,166.0,166.0
mean,52.331325,39.60241,45.614458,49.885542,42.331325,62.457831,57.740964,47.307229,39.084337
std,21.835858,21.456335,24.236127,21.40813,22.894855,17.188298,26.577943,22.519397,29.507351
min,0.0,4.0,0.0,9.0,2.0,15.0,2.0,2.0,0.0
25%,35.0,23.0,28.0,31.0,25.25,52.0,35.25,31.25,12.0
50%,53.0,35.0,43.0,49.0,41.5,65.0,60.0,48.5,35.0
75%,70.0,53.75,63.75,65.75,58.75,75.75,82.0,65.0,66.75
max,96.0,90.0,100.0,100.0,96.0,99.0,99.0,100.0,99.0


---

## ２．因子負荷量を求める

In [4]:
from sklearn.preprocessing import StandardScaler
# Standardize every column so that all subjects are on a comparable scale
# before the factor analysis.
sc = StandardScaler()
values_std = sc.fit_transform(df.values)
# Display the standardized array (a plain ndarray; column labels are lost here).
values_std



array([[-1.0257849 ,  0.15882818,  0.22288369, ..., -0.78274382,
        -0.14730571, -0.64872246],
       [-0.61237172, -0.86961243,  0.14011256, ..., -0.06570228,
         0.69896425, -0.78469207],
       [-1.0717197 , -0.44888673, -0.93591204, ..., -0.93369993,
         0.29809953, -1.12461609],
       ..., 
       [-0.33676294, -0.63587593, -0.68759867, ...,  0.38716605,
        -0.32546781, -0.41077565],
       [ 0.94941138, -0.40213943, -0.10820081, ...,  0.91551245,
         0.03085639,  0.575004  ],
       [ 0.35225902,  2.1222148 ,  1.79553503, ...,  0.87777342,
        -0.10276519,  1.56078365]])

In [5]:
from factor_analyzer import FactorAnalyzer

In [6]:
# Wrap the standardized array in a DataFrame that reuses the row index and
# the column labels of the raw data.
df_std = pd.DataFrame(values_std, index=df.index, columns=df.columns)

In [7]:
fa = FactorAnalyzer()
# Fit a two-factor model with varimax rotation on the standardized scores.
fa.analyze(df_std, 2, rotation='varimax')
# NOTE (original author): factor scores cannot be obtained from the Python
# package, so the variant below requesting regression scores is left disabled.
# fa.analyze(df_std, 2, rotation='varimax', scores='regression')

In [8]:
# Factor loadings: one row per subject, one column per extracted factor.
fa.loadings

Unnamed: 0,Factor1,Factor2
kokugo,-0.782942,0.42747
shakai,-0.894137,0.148634
sugaku,-0.868325,0.110725
rika,-0.938303,0.032381
ongaku,-0.799222,0.276312
bijutu,-0.615743,0.465988
taiiku,-0.073889,0.838507
gika,-0.728524,0.133834
eigo,-0.821381,0.237177


In [9]:
# Eigenvalues: returns a pair of tables, the original eigenvalues and the
# common-factor eigenvalues.
fa.get_eigenvalues()

(   Original_Eigenvalues
 0              6.006431
 1              1.098185
 2              0.490842
 3              0.406976
 4              0.300260
 5              0.221459
 6              0.182787
 7              0.171195
 8              0.121864,    Common_Factor_Eigenvalues
 0                   5.750859
 1                   0.813676
 2                   0.131451
 3                   0.103217
 4                   0.010771
 5                  -0.005481
 6                  -0.056416
 7                  -0.075562
 8                  -0.107980)

In [10]:
# Uniqueness of each subject, i.e. the share of its variance that is not
# explained by the common factors (1 - sum of squared loadings).
fa.get_uniqueness()

Unnamed: 0,Uniqueness
kokugo,0.204272
shakai,0.178426
sugaku,0.233751
rika,0.118539
ongaku,0.284895
bijutu,0.403715
taiiku,0.291446
gika,0.451341
eigo,0.26908


In [11]:
# Per-factor contribution (sum of squared loadings), proportion of variance
# explained, and cumulative proportion of variance explained.
fa.get_factor_variance()

Unnamed: 0,Factor1,Factor2
SS Loadings,5.275652,1.288883
Proportion Var,0.586184,0.143209
Cumulative Var,0.586184,0.729393
