In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the data
train_df = pd.read_csv('train.csv')
train_labels = pd.read_csv('trainLabels.csv')
test_df = pd.read_csv('test.csv')

# Check for consistency in the number of rows
print(f"Number of rows in train.csv: {train_df.shape[0]}")
print(f"Number of rows in trainLabels.csv: {train_labels.shape[0]}")

# Ensure the data is aligned by checking indexes or any other identifiers
# If necessary, align data manually or by using indices
if train_df.shape[0] != train_labels.shape[0]:
    # Assuming both dataframes have a common identifier, e.g., 'id'
    common_ids = train_df.index.intersection(train_labels.index)
    train_df = train_df.loc[common_ids]
    train_labels = train_labels.loc[common_ids]

# Example: Using only first 5 columns as features
X = train_df.iloc[:, :5]
y = train_labels

# Split the data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature extraction using CountVectorizer (Bag of Words)
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train['content'])  # Assuming 'content' is a column
X_val_vec = vectorizer.transform(X_val['content'])

# Training the model
model = OneVsRestClassifier(LogisticRegression())
model.fit(X_train_vec, y_train)

# Predictions
y_pred = model.predict(X_val_vec)

# Evaluation
print("Accuracy:", accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))

# Preparing submission
test_vec = vectorizer.transform(test_df['content'])
test_pred = model.predict_proba(test_vec)

# Create the submission file
submission = pd.DataFrame(test_pred, columns=train_labels.columns)
submission.to_csv('submission.csv', index=False)


Number of rows in train.csv: 9999
Number of rows in trainLabels.csv: 49999


KeyError: 'content'