# Task for Today  

***

## Rare Star Prediction  

Given *data about stars*, let's try to predict whether a given star is a **pulsar star**.  
  
We will use a logistic regression model to make our predictions.

# Getting Started

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, classification_report

In [1]:
# Relative path to the training CSV of the pulsar dataset.
TRAIN_CSV_PATH = '../input/predicting-pulsar-starintermediate/pulsar_data_train.csv'

# Load the raw table; every later cell works from this DataFrame.
data = pd.read_csv(TRAIN_CSV_PATH)

In [1]:
# Show the loaded DataFrame for a quick visual check of its columns and values.
data

In [1]:
# Print per-column dtypes and non-null counts; columns with fewer non-null
# entries than rows contain missing values that preprocessing must handle.
data.info()

# Preprocessing

In [1]:
def preprocess_inputs(df):
    """Turn the raw star table into scaled train/test features and labels.

    Processing order (chosen so that no statistic is computed on test rows):
      1. Drop rows whose 'target_class' label is missing.
      2. Separate features (X) from the 'target_class' label (y).
      3. Shuffle and split 70% / 30% with a fixed random_state of 1.
      4. Fill missing feature values with the column means of the
         training split, applied to both splits.
      5. Standardize features with a StandardScaler fitted on the
         training split only.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the feature columns plus a 'target_class' column.
        The caller's frame is not modified.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Imputed and standardized features; the original row index and
        column names are preserved.
    y_train, y_test : pandas.Series
        The 'target_class' labels matching the rows of X_train / X_test.
    """
    # A missing label cannot be imputed: a column mean would yield a value
    # that is not a valid class. dropna returns a new frame, so the caller's
    # DataFrame is left untouched.
    df = df.dropna(subset=['target_class'])

    # Split df into X and y
    y = df['target_class']
    X = df.drop('target_class', axis=1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)

    # Fill missing values using means learned from the training split only,
    # so that no information from the test rows leaks into preprocessing.
    train_means = X_train.mean()
    X_train = X_train.fillna(train_means)
    X_test = X_test.fillna(train_means)

    # Scale X (the scaler is fitted on the training split, then reused as-is)
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test

In [1]:
# Run the full preprocessing pipeline once; the four results are reused by
# the training and evaluation cells below.
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [1]:
# Inspect the training features after imputation and standard scaling.
X_train

In [1]:
# Inspect the training labels taken from the 'target_class' column.
y_train

# Training/Results

In [1]:
def evaluate_model(model, X_test, y_test):
    """Print test accuracy, plot a confusion matrix and print a class report.

    Parameters
    ----------
    model : fitted classifier
        Must expose ``score(X, y)`` and ``predict(X)``.
    X_test : array-like
        Features of the held-out split.
    y_test : array-like
        True labels of the held-out split. Assumed to be encoded as
        0 (shown as "NORMAL") and 1 (shown as "PULSAR") -- confirm if the
        label encoding ever changes.

    Returns
    -------
    None
        Results are printed and shown as a figure.
    """
    # Label values and their display names, defined once so the matrix,
    # the tick labels and the report cannot drift apart.
    class_labels = [0, 1]
    class_names = ["NORMAL", "PULSAR"]

    acc = model.score(X_test, y_test)
    print("Test Accuracy: {:.2f}%".format(acc * 100))

    y_pred = model.predict(X_test)
    # Pin the label set: without it the matrix shrinks to the classes that
    # happen to occur, which misaligns the tick labels, and the report
    # raises on a target_names size mismatch.
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    clr = classification_report(y_test, y_pred, labels=class_labels, target_names=class_names)

    # Draw on an explicit Axes instead of relying on pyplot's current figure.
    fig, ax = plt.subplots(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt='g', vmin=0, cmap='Blues', cbar=False, ax=ax)
    # Heatmap cells are centered at half-integer positions.
    tick_positions = np.arange(len(class_names)) + 0.5
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(class_names)
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(class_names)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title("Confusion Matrix")
    plt.show()

    print("Classification Report:\n---------------------\n", clr)

In [1]:
# Errors on label 1 are penalized three times as heavily as errors on
# label 0 when fitting the classifier.
class_weights = {0: 1.0, 1: 3.0}

# fit() returns the estimator itself, so construction and training chain.
model = LogisticRegression(class_weight=class_weights).fit(X_train, y_train)

evaluate_model(model, X_test, y_test)

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/H_QNyFF4lBo