In [59]:
from google.colab import drive, files

# Mount Google Drive so the CSV files are reachable under /content/drive/.
drive.mount('/content/drive/')

# Locations of the input CSVs on the mounted Drive, one name per file.
test_csv_path = "/content/drive/My Drive/test.csv"
train_csv_path = "/content/drive/My Drive/train.csv"

# Backward compatibility: this cell used to assign `output_directory` twice,
# so the test.csv value was immediately overwritten and only the train.csv
# path survived. Keep that final value under the old name.
output_directory = train_csv_path

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [60]:
# Debug check: print the column labels of the training frame.
# NOTE(review): `train_data` is not assigned in any cell above this one in the
# visible notebook (it is read from CSV in later cells), so this cell relies on
# state left over in the kernel from an earlier run.
print(train_data.columns)

Index(['id', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality'],
      dtype='object')


In [61]:
# Debug check: print the first rows of the training frame.
# NOTE(review): like the column listing, this depends on `train_data` already
# existing in the kernel; no cell above this one in the visible notebook loads it.
print(train_data.head())

   id  fixed acidity  volatile acidity  citric acid  residual sugar  \
0   0            7.7              0.63         0.00             2.4   
1   1            6.8              0.48         0.32             2.5   
2   2            6.4              0.59         0.01             2.8   
3   3            7.0              0.74         0.24             2.1   
4   4           11.5              0.32         0.32             2.8   

   chlorides  free sulfur dioxide  total sulfur dioxide  density    pH  \
0      0.078                  4.0                  14.0  0.99650  3.31   
1      0.086                 33.0                  58.0  0.99740  3.53   
2      0.086                  3.0                  10.0  0.99716  3.45   
3      0.072                 14.0                  28.0  0.99498  3.37   
4      0.082                 14.0                  37.0  0.99560  3.60   

   sulphates  alcohol  quality  
0       0.53      9.2      5.0  
1       0.49      9.7      6.0  
2       0.49      9.5      6.

In [62]:
# Imported here so this cell also runs on a fresh kernel; the notebook's main
# import block lives in a later cell.
import pandas as pd

# Load the training data from the mounted Drive.
train_data = pd.read_csv("/content/drive/My Drive/train.csv")
print("Columns in train_data:", train_data.columns)
print("First few rows of train_data:")
print(train_data.head())

# Build the feature matrix by removing the row identifier and the target.
# The identifier column in this dataset is lowercase 'id' (see the printed
# column index); checking for 'Id' always took the error branch and left
# X_train undefined.
if "id" in train_data.columns and "quality" in train_data.columns:
    X_train = train_data.drop(columns=["id", "quality"])
else:
    print("Error: 'id' or 'quality' column not found in train_data.")

Columns in train_data: Index(['id', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality'],
      dtype='object')
First few rows of train_data:
   id  fixed acidity  volatile acidity  citric acid  residual sugar  \
0   0            7.7              0.63         0.00             2.4   
1   1            6.8              0.48         0.32             2.5   
2   2            6.4              0.59         0.01             2.8   
3   3            7.0              0.74         0.24             2.1   
4   4           11.5              0.32         0.32             2.8   

   chlorides  free sulfur dioxide  total sulfur dioxide  density    pH  \
0      0.078                  4.0                  14.0  0.99650  3.31   
1      0.086                 33.0                  58.0  0.99740  3.53   
2      0.086                  3.0                  10.0  0.

In [63]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score

# Read the train and test CSVs from the mounted Google Drive folder.
train_data = pd.read_csv("/content/drive/My Drive/train.csv")
test_data = pd.read_csv("/content/drive/My Drive/test.csv")

# Debug output: show the schema and a preview of the training frame.
print("Columns in train_data:", train_data.columns)
print("First few rows of train_data:")
print(train_data.head())

# The target is 'quality'; the predictors are every column except the target
# and the lowercase 'id' row identifier.
y_train = train_data["quality"]
X_train = train_data.drop(["id", "quality"], axis=1)

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Define a function to calculate the quadratic weighted kappa
def qwk(actual, predicted):
    """Compute the quadratic weighted kappa (QWK) between two sets of ratings.

    Both inputs are converted to integer arrays with ``astype(int)`` (which
    truncates any fractional part) and flattened. Labels are shifted so the
    smallest observed rating maps to index 0, which keeps the confusion matrix
    compact and allows negative labels; QWK depends only on differences
    between labels, so the shift does not change the result.

    Parameters
    ----------
    actual : array-like
        Reference ratings.
    predicted : array-like
        Predicted ratings, same number of elements as ``actual``.

    Returns
    -------
    float
        ``1 - sum(w * O) / sum(w * E)`` where ``O`` is the observed confusion
        matrix, ``E`` the expected matrix under independence and ``w`` the
        quadratic disagreement weights. Returns ``0.0`` when the expected
        weighted disagreement is zero (e.g. every rating is the same class).

    Raises
    ------
    ValueError
        If the inputs are empty or contain different numbers of elements.
    """
    actual = np.asarray(actual).astype(int).ravel()
    predicted = np.asarray(predicted).astype(int).ravel()

    if actual.size != predicted.size:
        raise ValueError(
            "actual and predicted must have the same number of elements "
            f"(got {actual.size} and {predicted.size})"
        )
    if actual.size == 0:
        raise ValueError("qwk requires at least one rating")

    # Re-index labels from the smallest observed rating so that negative
    # labels cannot wrap around when used as matrix indices.
    lowest = min(actual.min(), predicted.min())
    highest = max(actual.max(), predicted.max())
    n_ratings = int(highest - lowest) + 1
    actual_idx = actual - lowest
    predicted_idx = predicted - lowest

    # Observed confusion matrix; np.add.at accumulates repeated index pairs.
    O = np.zeros((n_ratings, n_ratings))
    np.add.at(O, (actual_idx, predicted_idx), 1)

    # Quadratic weights normalised by the number of rating categories (not
    # the number of samples). max(..., 1) avoids 0/0 when only one category
    # is present; the weight matrix is all zeros in that case anyway.
    ratings = np.arange(n_ratings)
    w = np.subtract.outer(ratings, ratings) ** 2 / max(n_ratings - 1, 1) ** 2

    # Expected matrix under independence, scaled to the same total as O.
    N = O.sum()
    actual_hist = np.bincount(actual_idx, minlength=n_ratings)
    predicted_hist = np.bincount(predicted_idx, minlength=n_ratings)
    E = np.outer(actual_hist, predicted_hist) / N

    num = np.sum(w * O)
    denom = np.sum(w * E)
    if denom == 0:
        return 0.0
    return float(1 - num / denom)

# Train and evaluate different regression models.
# The tree-based estimators are randomised, so they get a fixed random_state
# to make the scores (and therefore the selected model) reproducible.
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Elastic Net": ElasticNet(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "Support Vector Machine": SVR()
}

best_model = None
best_name = None
# Start below every possible kappa (its minimum is -1) so that any valid
# score, including exactly -1, can be selected.
best_score = -np.inf

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    # The regressors output continuous values; quality is an integer grade,
    # so round to the nearest integer before scoring.
    y_pred = np.round(y_pred).astype(int)

    score = qwk(y_val, y_pred)  # Evaluate against validation set

    print(f"{name} QWK: {score:.4f}")

    if score > best_score:
        best_model = model
        best_name = name
        best_score = score

# Fail with a clear message instead of an AttributeError on None if no model
# produced a comparable score (e.g. every score was NaN).
if best_model is None:
    raise RuntimeError("No model produced a valid QWK score; cannot build a submission.")

print(f"Best model: {best_name} (QWK: {best_score:.4f})")

# Use the best model to predict the quality for the test data
X_test = test_data.drop(columns=["id"])  # Drop the id column from test data
y_test_pred = best_model.predict(X_test)

# Round predictions for quality
y_test_pred = np.round(y_test_pred).astype(int)

# Create the submission file; the identifier is read from the lowercase 'id'
# column of test_data and written under the header "Id".
submission_data = pd.DataFrame({"Id": test_data["id"], "quality": y_test_pred})
submission_data.to_csv("submission.csv", index=False)

print("Submission file created successfully.")

Columns in train_data: Index(['id', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality'],
      dtype='object')
First few rows of train_data:
   id  fixed acidity  volatile acidity  citric acid  residual sugar  \
0   0            7.7              0.63         0.00             2.4   
1   1            6.8              0.48         0.32             2.5   
2   2            6.4              0.59         0.01             2.8   
3   3            7.0              0.74         0.24             2.1   
4   4           11.5              0.32         0.32             2.8   

   chlorides  free sulfur dioxide  total sulfur dioxide  density    pH  \
0      0.078                  4.0                  14.0  0.99650  3.31   
1      0.086                 33.0                  58.0  0.99740  3.53   
2      0.086                  3.0                  10.0  0.

In [58]:
from google.colab import files

# Download the submission file through Colab's `files` helper.
# NOTE(review): "submission.csv" is written by the model-training cell
# (`submission_data.to_csv`), so that cell must have run in this session first.
files.download("submission.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>