In [6]:
import os

import pandas as pd
from sklearn.metrics import roc_auc_score

# Directory that holds train.csv and test.csv. The default is the original
# local download folder; set the JIGSAW_DATA_DIR environment variable to run
# the notebook on another machine without editing this cell.
DATA_DIR = os.environ.get(
    "JIGSAW_DATA_DIR",
    "C:/Users/satra/Downloads/jigsaw-agile-community-rules",
)

# Plain string paths (forward slashes), identical to the original values when
# the default directory is used.
trn = f"{DATA_DIR}/train.csv"
tst = f"{DATA_DIR}/test.csv"

# Raw frames; later cells derive the train/validation/test tables from these.
df_trn = pd.read_csv(trn)
df_tst = pd.read_csv(tst)

In [34]:
# Training table: every row of df_trn contributes four labelled texts, taken
# from its example columns. Each entry below is (source column, label); the
# list order is the order in which the four texts appear for each source row.
example_cols = [
    ('positive_example_1', 1),
    ('positive_example_2', 1),
    ('negative_example_1', 0),
    ('negative_example_2', 0),
]
n_per_row = len(example_cols)

trn_df = pd.DataFrame({
    # Row-major flatten: row 0's four examples, then row 1's four, and so on.
    'body': df_trn[[col for col, _ in example_cols]].to_numpy(dtype=object).ravel(),
    # Each rule / subreddit value is repeated consecutively, once per example.
    'rule': df_trn['rule'].repeat(n_per_row).to_numpy(),
    'subreddit': df_trn['subreddit'].repeat(n_per_row).to_numpy(),
    # Label pattern [1, 1, 0, 0] tiled once per source row.
    'label': [label for _, label in example_cols] * len(df_trn),
})

# Validation table: the 'body' text of each df_trn row with its
# 'rule_violation' value exposed under the name 'label', so the columns match
# trn_df. The index is reset to a fresh 0..n-1 range.
val_df = (
    df_trn[['body', 'rule', 'subreddit', 'rule_violation']]
    .rename(columns={'rule_violation': 'label'})
    .reset_index(drop=True)
)

# Test table: same text columns as the other tables, but without a label
# column. Taken as a copy with a fresh 0..n-1 index.
tst_df = df_tst[['body', 'rule', 'subreddit']].reset_index(drop=True)

# Quick shape check of the three derived tables.
print (f'Train shape: {trn_df.shape}, Val shape: {val_df.shape}, Test shape: {tst_df.shape}')

Train shape: (8116, 4), Val shape: (2029, 4), Test shape: (10, 3)


In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer

# One TF-IDF vectorizer per text field; only the body vectorizer is given a
# max_features limit.
body_vec = TfidfVectorizer(max_features=10000)
rule_vec = TfidfVectorizer()
sred_vec = TfidfVectorizer()

# Each vectorizer is fitted on the training table only and then reused,
# already fitted, to transform the validation and test tables.

# Comment body features.
X_body_trn = body_vec.fit_transform(trn_df['body'])
X_body_val, X_body_tst = (
    body_vec.transform(frame['body']) for frame in (val_df, tst_df)
)

# Rule text features.
X_rule_trn = rule_vec.fit_transform(trn_df['rule'])
X_rule_val, X_rule_tst = (
    rule_vec.transform(frame['rule']) for frame in (val_df, tst_df)
)

# Subreddit name features.
X_sred_trn = sred_vec.fit_transform(trn_df['subreddit'])
X_sred_val, X_sred_tst = (
    sred_vec.transform(frame['subreddit']) for frame in (val_df, tst_df)
)

# Targets for the two labelled tables.
Y_trn, Y_val = trn_df['label'], val_df['label']

In [37]:
from scipy.sparse import hstack, csr_matrix

def _stack_csr(*blocks):
    """Join the given sparse feature blocks column-wise and return CSR."""
    return hstack(list(blocks)).tocsr()


# Same block order (body, rule, subreddit) for every split so that the
# columns of the three matrices line up.
X_trn = _stack_csr(X_body_trn, X_rule_trn, X_sred_trn)
X_val = _stack_csr(X_body_val, X_rule_val, X_sred_val)
X_tst = _stack_csr(X_body_tst, X_rule_tst, X_sred_tst)

print(f'X_trn shape: {X_trn.shape}, X_val shape: {X_val.shape}, X_tst shape: {X_tst.shape}')

X_trn shape: (8116, 5746), X_val shape: (2029, 5746), X_tst shape: (10, 5746)


In [39]:
import numpy as np
import lightgbm as lgb

# Wrap the stacked sparse matrices as LightGBM datasets. The validation set
# names the training set as its reference, as the LightGBM Dataset docs ask
# for validation data.
trn_data = lgb.Dataset(X_trn, label=Y_trn)
val_data = lgb.Dataset(X_val, label=Y_val, reference=trn_data)

# Upper bound on boosting rounds and the early-stopping patience.
NUM_BOOST_ROUND = 4096
EARLY_STOPPING_ROUNDS = 32

# The round count is given once, through num_boost_round below. Also listing
# it in params as 'n_estimators' (an alias of the same setting) only makes
# lgb.train emit a "Found `n_estimators` in params" warning.
params = {
    'objective': 'binary',
    'metric': 'auc',
    'random_state': 42,
    'learning_rate': .008,
}

model = lgb.train(
    params,
    trn_data,
    num_boost_round=NUM_BOOST_ROUND,
    valid_sets=[val_data],
    callbacks=[lgb.early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS)],
)

# Validation predictions and AUC. Kept under its own name so that `prob`
# refers only to the test predictions that are written to the submission.
val_prob = np.round(model.predict(X_val), 6)
auc = roc_auc_score(Y_val, val_prob)
print(f'Probabilities: {len(val_prob)} {val_prob} AUC: {auc}')

# Test predictions, rounded to 6 decimals as they appear in the CSV.
prob = np.round(model.predict(X_tst), 6)
print(f'Predictions: {len(prob)} {prob}')

# Pair each test row_id (from the raw test frame) with its predicted value.
sub_df = pd.DataFrame({
    "row_id": df_tst["row_id"],
    "rule_violation": prob
})

# Write the submission file to the current working directory.
sub_df.to_csv("submission.csv", index=False)
print("submission.csv created successfully!")
print(sub_df.head(10))


[LightGBM] [Info] Number of positive: 4058, number of negative: 4058
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017888 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 18150
[LightGBM] [Info] Number of data points in the train set: 8116, number of used features: 1537
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Training until validation scores don't improve for 32 rounds
Early stopping, best iteration is:
[1717]	valid_0's auc: 0.911998
Probabilities: 2029 [1.13680e-02 8.31000e-04 8.58400e-03 ... 2.84000e-04 1.65257e-01
 9.97586e-01] AUC: 0.9119976130728966
Predictions: 10 [1.67330e-02 9.96500e-02 9.91338e-01 9.13102e-01 9.98173e-01 3.44510e-02
 9.57572e-01 3.09620e-02 2.20000e-04 9.84765e-01]
submission.csv created successfully!
   row_id  rule_violation
0    2029        0.016733
1    2030        0.