In [2]:
#import rapids libraries
import cupy
import cudf
import cuml

In [3]:
# Load the competition's train and test CSV files into cudf dataframes.
df, test = (
    cudf.read_csv(csv_path)
    for csv_path in (
        '../input/netflix-appetency/train.csv',
        '../input/netflix-appetency/test.csv',
    )
)

In [4]:
# Display the training frame (rich repr of the cell's last expression).
df

In [5]:
# Remove the columns the author identified as date/time features from both
# frames so train and test keep the same feature set.
date_cols = [
    'feature_191', 'feature_192', 'feature_194', 'feature_195',
    'feature_199', 'feature_200', 'feature_201', 'feature_202',
    'feature_203', 'feature_204',
]
df, test = df.drop(columns=date_cols), test.drop(columns=date_cols)


In [6]:
# Drop columns in which more than half of the training rows are null.
#
# The null fraction is computed against the real row count (len(df)) instead
# of a hard-coded 70000, so the threshold stays correct if the data changes.
# All null counts come from a single frame-wide reduction rather than one
# reduction per column.
null_fraction = (df.isnull().sum() / len(df)).to_pandas()
sparse_cols = list(null_fraction.index[null_fraction > 0.5])

df = df.drop(columns=sparse_cols)
# errors='ignore': a column that exists only in the training frame (e.g. the
# label) must not make the test-side drop raise.
test = test.drop(columns=sparse_cols, errors='ignore')

In [7]:
# Display the training frame at this point in the pipeline.
df

In [8]:
# Fill null values with each column's mode (most frequent value).
#
# The modes are learned from the training frame and reused for the test frame,
# so both frames are imputed with the same value per column. A column that
# exists only in test falls back to its own mode.
# NOTE(review): assumes a shared column has the same dtype in train and test.
train_modes = {}
for c in df.columns:
    col_mode = df[c].mode()
    if len(col_mode) == 0:
        # Entirely-null column: there is no mode to fill with.
        continue
    train_modes[c] = col_mode.iloc[0]
    # Assign back instead of fillna(inplace=True) on a column selection, which
    # is not guaranteed to write through to the parent frame.
    df[c] = df[c].fillna(train_modes[c])

for c in test.columns:
    if c in train_modes:
        fill_value = train_modes[c]
    else:
        col_mode = test[c].mode()
        if len(col_mode) == 0:
            continue
        fill_value = col_mode.iloc[0]
    test[c] = test[c].fillna(fill_value)

In [9]:
# Label-encode object (string) columns.
#
# For a column present in both frames the encoder is fitted once on the union
# of train and test values and then applied to both, so a given category maps
# to the same integer code in train and test. Fitting a separate encoder on
# test (as before) could assign different codes to the same category.
from cuml.preprocessing import LabelEncoder

obj_c = list(df.select_dtypes(['object']).columns)
obj_c_test = list(test.select_dtypes(['object']).columns)

for c in obj_c:
    lb = LabelEncoder()
    if c in obj_c_test:
        lb.fit(cudf.concat([df[c], test[c]], ignore_index=True))
        df[c] = lb.transform(df[c])
        test[c] = lb.transform(test[c])
    else:
        # Column is a string column only in the training frame.
        df[c] = lb.fit_transform(df[c])

# Object columns that are strings only in test keep the original behaviour
# (encoded on their own).
for c in obj_c_test:
    if c not in obj_c:
        lb = LabelEncoder()
        test[c] = lb.fit_transform(test[c])

In [10]:
# Drop the columns whose variance is less than 0.2.
#
# Variances are computed once for the whole frame rather than recomputing
# df.var() for every column inside the loop.
variances = df.var().to_pandas()

# The first two columns are never dropped: later cells read the label from
# df.iloc[:, 1] and the features from df.iloc[:, 2:], so removing either would
# silently shift those positions.
# NOTE(review): assumes column 0 is the id and column 1 is the label - confirm.
protected_cols = set(df.columns[:2])
low_var_cols = [
    c for c in variances.index[variances < 0.2] if c not in protected_cols
]

df = df.drop(columns=low_var_cols)
# errors='ignore': tolerate columns that are absent from the test frame.
test = test.drop(columns=low_var_cols, errors='ignore')
df

In [11]:
# Standardise the features.
from cuml.preprocessing import StandardScaler

# Positional layout used here: df = [col 0, label, features...],
# test = [col 0, features...].
# NOTE(review): assumes the feature columns of df and test are in the same order.
x = df.iloc[:, 2:]
y = df.iloc[:, 1]
x_test = test.iloc[:, 1:]

ss = StandardScaler()
x_scaled = ss.fit_transform(x)
# Reuse the mean/std learned from the training features. Calling fit_transform
# here would refit the scaler on test and put the two sets on different scales.
x_test_scaled = ss.transform(x_test)

In [12]:
# Display the standardised training features.
x_scaled

In [13]:
# Split the scaled training data 70/30 into fit and hold-out parts; the seed
# is fixed so the split is repeatable.
from cuml.model_selection import train_test_split

x_train1, x_test1, y_train1, y_test1 = train_test_split(
    x_scaled,
    y,
    random_state=42,
    train_size=0.7,
)

In [14]:
# Logistic-regression baseline: fit on the training split, score on the
# hold-out split (x_test1 / y_test1).
from cuml.linear_model import LogisticRegression

reg = LogisticRegression()
reg.fit(x_train1, y_train1)
preds = reg.predict(x_test1)

# Some of the metrics below are called with a float64 copy of the labels.
y_test1_f64 = y_test1.astype('float64')
regression_metrics = cuml.metrics.regression

print("MSE:", regression_metrics.mean_squared_error(y_test1, preds))
print("R2 Score:", regression_metrics.r2_score(y_test1_f64, preds))
print("MAE:", regression_metrics.mean_absolute_error(y_test1, preds))
print("Accuracy:", cuml.metrics.accuracy_score(y_test1_f64, preds))

In [15]:
# Random-forest baseline scored on the same hold-out split.
from cuml.ensemble import RandomForestClassifier as cuRF

rf_params = dict(n_estimators=250, max_depth=25000, random_state=0)
model = cuRF(**rf_params)

trained_RF = model.fit(x_train1, y_train1)
predictions = model.predict(x_test1)

rf_accuracy = cuml.metrics.accuracy_score(y_test1.astype('float64'), predictions)
print("Accuracy:", rf_accuracy)

In [16]:
# Candidate final estimators. Per the author's note, logistic regression gave
# the best result, so only final_model1 is fitted and used for prediction;
# final_model2 and final_model3 are constructed but not trained here.
final_model1 = LogisticRegression(max_iter=5000)
final_model2 = cuml.svm.SVC(
    kernel='rbf',
    gamma='auto',
    class_weight='balanced',
    probability=True,
)
final_model3 = cuml.neighbors.KNeighborsClassifier(n_neighbors=100)

# Train on all scaled training rows, then get class probabilities for test.
trained_model = final_model1.fit(x_scaled, y)
final_predictions = trained_model.predict_proba(x_test_scaled)

In [17]:

# Random forest trained on the full scaled training set; inputs and labels are
# cast to float32 before fitting and predicting.
clf = cuRF(random_state=42, max_depth=10, n_estimators=100)

x_scaled_f32 = x_scaled.astype('float32')
y_f32 = y.astype('float32')
x_test_scaled_f32 = x_test_scaled.astype('float32')

classifier = clf.fit(x_scaled_f32, y_f32)
predictions3 = classifier.predict_proba(x_test_scaled_f32)
print(predictions3)

In [18]:
# Display the class probabilities returned by trained_model.predict_proba for the test set.
final_predictions

In [19]:
# Assemble the submission frame: the test ids alongside entry 1 of the
# predicted probabilities.
# NOTE(review): final_predictions[1] is presumably the class-1 probability
# column of a dataframe-like result; confirm the predict_proba output type.
positive_class_proba = final_predictions[1]

df_ans = cudf.DataFrame()
df_ans['id'] = test['id']
df_ans['target'] = positive_class_proba
df_ans


In [20]:
# Write the submission frame to CSV without the index column.
filename = 'submission.csv'
df_ans.to_csv(
    filename,
    index=False,
)