# Data Mining

In [1]:
import numpy as np
import pandas as pd

<h1>Cleaning

In [2]:
# Import the data
df = pd.read_csv('youtube.csv')
df

Unnamed: 0,Rank,Grade,Channel name,Video Uploads,Subscribers,Video views
0,1st,A++,Zee TV,82757,18752951,20869786591
1,2nd,A++,T-Series,12661,61196302,47548839843
2,3rd,A++,Cocomelon - Nursery Rhymes,373,19238251,9793305082
3,4th,A++,SET India,27323,31180559,22675948293
4,5th,A++,WWE,36756,32852346,26273668433
...,...,...,...,...,...,...
4995,"4,996th",B+,Uras Benlioğlu,706,2072942,441202795
4996,"4,997th",B+,HI-TECH MUSIC LTD,797,1055091,377331722
4997,"4,998th",B+,Mastersaint,110,3265735,311758426
4998,"4,999th",B+,Bruce McIntosh,3475,32990,14563764


In [3]:
# drop Rank, Grade, and 'Channel name' columns
df.drop(['Rank', 'Grade', 'Channel name'], axis=1, inplace=True)

In [4]:
df.head()

Unnamed: 0,Video Uploads,Subscribers,Video views
0,82757,18752951,20869786591
1,12661,61196302,47548839843
2,373,19238251,9793305082
3,27323,31180559,22675948293
4,36756,32852346,26273668433


In [5]:
# Rename the columns
df.rename(columns={'Video Uploads': 'Videos', 'Subscribers': 'Subscribers', 'Video views' : 'Views'}, inplace=True)
df.head()

Unnamed: 0,Videos,Subscribers,Views
0,82757,18752951,20869786591
1,12661,61196302,47548839843
2,373,19238251,9793305082
3,27323,31180559,22675948293
4,36756,32852346,26273668433


In [6]:
# Check the unique values in all the columns
df.nunique()

# Change dataframe to numeric
df = df.apply(pd.to_numeric, errors='coerce')
df.dropna(inplace=True)
# Change dtype to int
# df.astype(int)

# Check the data types of all the columns
df.dtypes

Videos         float64
Subscribers    float64
Views            int64
dtype: object

In [7]:
# Evaluate the data into logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split as split

logistic_regression = LogisticRegression(solver='liblinear')

# If the views are greater than 1000000, then the channel is popular
df['Views'] = np.where(df['Views'] > 150000000, 1, 0)

# If the subscribers are greater than 1000000, then the channel is popular
df['Subscribers'] = np.where(df['Subscribers'] > 5000000, 1, 0)

# If the videos are greater than 350, then the channel is popular
df['Videos'] = np.where(df['Videos'] > 1000, 1, 0)


# Split the data into train and test sets
X_trainset, X_testset, y_trainset, y_testset = split(
    df[['Videos', 'Subscribers']], 
    df['Views'], 
    test_size=0.3, 
    random_state=3
)

logistic_regression.fit(X_trainset, y_trainset)


In [8]:
# Predict the values
y_predict = logistic_regression.predict(X_testset)

# Check the accuracy of the model
from sklearn.metrics import accuracy_score
accuracy_score(y_testset, y_predict)

# Save the model
import pickle
pickle.dump(logistic_regression, open('model.pkl', 'wb'))