# Getting Started

In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, LogisticRegression

In [None]:
# Load the training CSV into a pandas DataFrame.
data = pd.read_csv('../input/dissolved-oxygen-prediction-in-river-water/train.csv')

In [None]:
# Display the DataFrame (the notebook renders the cell's last expression).
data

In [None]:
# Print a summary of the DataFrame: columns, dtypes and non-null counts.
data.info()

# Preprocessing

In [None]:
# Remove the 'Id' column from the DataFrame before any further processing.
data = data.drop(columns=['Id'])

## Missing Values

In [None]:
# Count the missing values in each column.
data.isna().sum()

In [None]:
# Collect every column that has more than 100 missing entries ...
null_columns = [
    column for column in data.columns
    if data[column].isna().sum() > 100
]

# ... and remove those columns from the DataFrame.
data = data.drop(columns=null_columns)

In [None]:
# Display the DataFrame again to inspect its current columns.
data

In [None]:
# Re-check the per-column missing-value counts.
data.isna().sum()

In [None]:
# Report how many columns and how many rows contain at least one missing value.
missing_mask = data.isna()
print("Columns with missing values:", missing_mask.any(axis=0).sum())
print("Rows with missing values:", missing_mask.any(axis=1).sum())

In [None]:
# Discard every row that still contains a missing value.
data = data.dropna(axis=0)

In [None]:
# Total number of missing values across the whole DataFrame.
data.isna().sum().sum()

In [None]:
# Display the DataFrame in its current state.
data

## Splitting and Scaling

In [None]:
# Separate the feature columns (X) from the 'target' column (y).
X = data.drop(columns=['target'])
y = data.loc[:, 'target']

In [None]:
# Standardize the features; X becomes a NumPy array after the transform.
# NOTE(review): the scaler is fit on all rows before the train/test split,
# so test-set statistics influence the scaling -- consider fitting on the
# training portion only.
scaler = StandardScaler()
scaler.fit(X)

X = scaler.transform(X)

# Training (Regression)

In [None]:
# Hold out 30% of the rows for evaluating the regression model.
# A fixed random_state makes the shuffle (and therefore the reported score)
# reproducible across runs; the seed value itself is arbitrary.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=42
)

In [None]:
# Fit ordinary least squares on the training split, then record the
# coefficient of determination (R^2) on the held-out split.
model = LinearRegression().fit(X_train, y_train)
model_R2 = model.score(X_test, y_test)

In [None]:
# Report the regression model's R^2 on the test split.
print("Model R^2 Score:", model_R2)

# New Problem: Predicting High or Low Dissolved Oxygen

In [None]:
# Display the target values.
y

In [None]:
# Mean of the target, shown for reference only.
y.mean()

In [None]:
# Bin the target into two equal-frequency classes: 0 for the lower
# half and 1 for the upper half. With q=2 the cut point is the median of y.
y_new = pd.qcut(y, q=2, labels=[0, 1])

# Training (Classification)

In [None]:
# Hold out 30% of the rows for evaluating the classifier. This rebinds
# X_train/X_test/y_train/y_test, now with the binary labels as the target.
# A fixed random_state makes the shuffle (and therefore the reported
# accuracy) reproducible across runs; the seed value itself is arbitrary.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_new, train_size=0.7, random_state=42
)

In [None]:
# Fit a logistic-regression classifier on the training split, then record
# its mean accuracy on the held-out split.
model = LogisticRegression().fit(X_train, y_train)
model_acc = model.score(X_test, y_test)

In [None]:
# Report the classifier's accuracy on the test split.
print("Model Accuracy:", model_acc)

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/arfFLTtah_A