# Import

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load Dataset

In [2]:
data = pd.read_csv('Indeks Standar Pencemar Udara.csv')

data.head()

Unnamed: 0,tanggal,pm10,pm25,so2,co,o3,no2,max,critical,categori,location
0,1/1/2021,43,,58,29,35,65,65,O3,SEDANG,DKI2
1,1/2/2021,58,,86,38,64,80,86,PM25,SEDANG,DKI3
2,1/3/2021,64,,93,25,62,86,93,PM25,SEDANG,DKI3
3,1/4/2021,50,,67,24,31,77,77,O3,SEDANG,DKI2
4,1/5/2021,59,,89,24,35,77,89,PM25,SEDANG,DKI3


# Selection

In [3]:
# artibut yang dihilangkan adalah tanggal, stasiun dan critical

data_selection = data.drop(columns=['tanggal','critical', 'location'])
data_selection.head()

Unnamed: 0,pm10,pm25,so2,co,o3,no2,max,categori
0,43,,58,29,35,65,65,SEDANG
1,58,,86,38,64,80,86,SEDANG
2,64,,93,25,62,86,93,SEDANG
3,50,,67,24,31,77,77,SEDANG
4,59,,89,24,35,77,89,SEDANG


# Cleaning

In [4]:
# pembersihan data missing value

data_cleaning = data_selection.dropna()
data_cleaning.head()

Unnamed: 0,pm10,pm25,so2,co,o3,no2,max,categori
31,73,126.0,38,26,46,34,126,TIDAK SEHAT
32,53,70.0,40,14,55,25,70,SEDANG
33,32,53.0,40,11,42,19,53,SEDANG
34,36,59.0,40,14,47,24,59,SEDANG
35,29,51.0,40,14,45,35,51,SEDANG


In [5]:
data_cleaning.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 334 entries, 31 to 364
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pm10      334 non-null    int64  
 1   pm25      334 non-null    float64
 2   so2       334 non-null    int64  
 3   co        334 non-null    int64  
 4   o3        334 non-null    int64  
 5   no2       334 non-null    int64  
 6   max       334 non-null    int64  
 7   categori  334 non-null    object 
dtypes: float64(1), int64(6), object(1)
memory usage: 23.5+ KB


# Ganti Type Data

In [6]:
data_type = data_cleaning.astype({"pm25": 'int'})
data_type.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 334 entries, 31 to 364
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   pm10      334 non-null    int64 
 1   pm25      334 non-null    int32 
 2   so2       334 non-null    int64 
 3   co        334 non-null    int64 
 4   o3        334 non-null    int64 
 5   no2       334 non-null    int64 
 6   max       334 non-null    int64 
 7   categori  334 non-null    object
dtypes: int32(1), int64(6), object(1)
memory usage: 22.2+ KB


In [7]:
data_type.head()

Unnamed: 0,pm10,pm25,so2,co,o3,no2,max,categori
31,73,126,38,26,46,34,126,TIDAK SEHAT
32,53,70,40,14,55,25,70,SEDANG
33,32,53,40,11,42,19,53,SEDANG
34,36,59,40,14,47,24,59,SEDANG
35,29,51,40,14,45,35,51,SEDANG


In [8]:
data_type['categori'].value_counts()

SEDANG         195
TIDAK SEHAT    136
BAIK             3
Name: categori, dtype: int64

In [9]:
X = data_type.drop (columns='categori', axis=1)
Y = data_type['categori']
print(X)

     pm10  pm25  so2  co  o3  no2  max
31     73   126   38  26  46   34  126
32     53    70   40  14  55   25   70
33     32    53   40  11  42   19   53
34     36    59   40  14  47   24   59
35     29    51   40  14  45   35   51
..    ...   ...  ...  ..  ..  ...  ...
360    75   121   61  23  40   47  121
361    59    89   53  16  34   33   89
362    61    98   54  15  37   29   98
363    60   102   53  17  38   44  102
364    64    90   52  44  37   53   90

[334 rows x 7 columns]


In [10]:
print(Y)

31     TIDAK SEHAT
32          SEDANG
33          SEDANG
34          SEDANG
35          SEDANG
          ...     
360    TIDAK SEHAT
361         SEDANG
362         SEDANG
363    TIDAK SEHAT
364         SEDANG
Name: categori, Length: 334, dtype: object


# DATA TRAINING DAN DATA TESTING

In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [12]:
print(X.shape, X_train.shape, X_test.shape)

(334, 7) (267, 7) (67, 7)


# MODEL TRAINING

In [13]:
model = LogisticRegression()

model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

# EVALUASI TINGKAT AKURASI

In [14]:
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [15]:
print('Akurasi data training =', training_data_accuracy)

Akurasi data training = 0.8913857677902621


In [16]:
X_test_prediction = model.predict(X_test)
training_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [17]:
print('Akurasi data testing =', training_data_accuracy)


Akurasi data testing = 0.8955223880597015


# MODEL

In [20]:
input_data = (29, 45, 42, 7, 41, 12, 45)
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = model.predict(input_data_reshaped)
print(prediction)

if (prediction==['BAIK']):
  print('baik')
elif (prediction==['SEDANG']):
  print('sedang')
else:
  print('tidak sehat')

['BAIK']
baik




SIMPAN MODEL

In [19]:
import pickle 

filename = 'udara_model.sav'
pickle.dump(model, open(filename, 'wb'))