In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor

In [11]:
# Read the train and test CSVs of the "multilabeldataset" Kaggle input into DataFrames.
train_datasets, test_datasets = map(
    pd.read_csv,
    (
        "/kaggle/input/multilabeldataset/train.csv",
        "/kaggle/input/multilabeldataset/test.csv",
    ),
)

In [12]:
# Show the column labels of the training frame.
train_datasets.columns

Index(['text_id', 'full_text', 'cohesion', 'syntax', 'vocabulary',
       'phraseology', 'grammar', 'conventions'],
      dtype='object')

In [13]:
# Column whose text is vectorized and used as the model input.
text_datasets = "full_text"

# Columns used together as the regression targets.
target_datasets = [
    "cohesion",
    "syntax",
    "vocabulary",
    "phraseology",
    "grammar",
    "conventions",
]

In [14]:
# Summarise the training frame: row count, per-column non-null counts, dtypes and memory usage.
train_datasets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3911 entries, 0 to 3910
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   text_id      3911 non-null   object 
 1   full_text    3911 non-null   object 
 2   cohesion     3911 non-null   float64
 3   syntax       3911 non-null   float64
 4   vocabulary   3911 non-null   float64
 5   phraseology  3911 non-null   float64
 6   grammar      3911 non-null   float64
 7   conventions  3911 non-null   float64
dtypes: float64(6), object(2)
memory usage: 244.6+ KB


In [15]:
# Build TF-IDF features from the text column: English stop words are dropped and
# the vocabulary is limited to 3000 features.
# NOTE(review): the vectorizer is fitted on every training row here, before the
# later train/test split, so the held-out rows also contribute to the vocabulary
# and IDF weights — confirm this is acceptable for the evaluation below.
vectorizer = TfidfVectorizer(
    max_features=3000,
    stop_words="english",
)
x = vectorizer.fit_transform(train_datasets.loc[:, text_datasets])

# Target frame: one column per entry in target_datasets.
y = train_datasets.loc[:, target_datasets]

In [16]:
# Echo the vectorizer object as the cell output.
vectorizer

In [17]:
# Echo the TF-IDF matrix; its repr reports the sparse format, dtype, stored-element count and shape.
x

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 303453 stored elements and shape (3911, 3000)>

In [18]:
# Print the matrix's stored entries as (row, column) coordinates with their values.
print(x)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 303453 stored elements and shape (3911, 3000)>
  Coords	Values
  (0, 2193)	0.12388494176123349
  (0, 1024)	0.07553875110659099
  (0, 2796)	0.0641914598858746
  (0, 2892)	0.040213555763507575
  (0, 2640)	0.07178817192760151
  (0, 1512)	0.03634887602788875
  (0, 816)	0.05868301073750229
  (0, 2641)	0.056701301334151966
  (0, 2778)	0.050337020985872484
  (0, 767)	0.076264996029618
  (0, 2301)	0.061881844935482724
  (0, 2095)	0.10542701876460508
  (0, 2954)	0.03523990095696329
  (0, 489)	0.10320962028435836
  (0, 492)	0.10931545987317151
  (0, 1904)	0.07343524419048768
  (0, 2424)	0.11404736697029608
  (0, 102)	0.09339938918818916
  (0, 1179)	0.08030875891857728
  (0, 2350)	0.08987951394547711
  (0, 1562)	0.1087158505098047
  (0, 1532)	0.08172652824198307
  (0, 523)	0.05731388102356634
  (0, 421)	0.14722585227177235
  (0, 2718)	0.031155707907175628
  :	:
  (3910, 2697)	0.019865044570027853
  (3910, 702)	0.038630424196333064
  (3

In [19]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Candidate regressors keyed by the name used when reporting results.
# Every base estimator is wrapped in MultiOutputRegressor so that all candidates
# are fitted the same way against the multi-column target, and each uses
# random_state=42 for repeatable results.
model = {
    "DecisionTreeRegressor": MultiOutputRegressor(
        DecisionTreeRegressor(max_depth=10, random_state=42)
    ),
    "RandomForestRegressor": MultiOutputRegressor(
        RandomForestRegressor(n_estimators=10, random_state=42)
    ),
    "BaggingRegressor": MultiOutputRegressor(
        BaggingRegressor(n_estimators=10, random_state=42)
    ),
    "AdaBoostRegressor": MultiOutputRegressor(
        AdaBoostRegressor(n_estimators=10, random_state=42)
    ),
    # Key spelled to match the estimator class name (was misspelled "XBGRegressor").
    "XGBRegressor": MultiOutputRegressor(
        XGBRegressor(n_estimators=10, random_state=42)
    ),
}

In [21]:
# Fit every candidate on the training split, score it on the held-out split and
# collect one (model name, mean MSE across targets) tuple per model.
# The list is re-created here so re-running the cell does not duplicate entries.
model_resulted_value_for_individual = []
for model_name, model_value in model.items():
    model_value.fit(x_train, y_train)
    model_predictions = model_value.predict(x_test)

    # multioutput="raw_values" returns one MSE per target column instead of a single average.
    model_error_in_mse = mean_squared_error(y_test, model_predictions, multioutput="raw_values")
    print(f"{model_name} : model_error_in_mse_per_target: {model_error_in_mse}")

    # Collapse the per-target errors into one score so models can be compared directly.
    model_error_in_mse_update_average_mse = model_error_in_mse.mean()
    model_resulted_value_for_individual.append((model_name, model_error_in_mse_update_average_mse))
    print(f"{model_name} : model_error_in_mse_update_average_mse: {model_error_in_mse_update_average_mse:.4f}")
    print("-------------------------------------------------------------------------")

DecisionTreeRegressor : model_error_in_mse_update_average_mse: [0.43573006 0.42085651 0.36319028 0.48564969 0.56142323 0.47440887]
DecisionTreeRegressor : model_error_in_mse_update_average_mse: 0.4569
-------------------------------------------------------------------------
RandomForestRegressor : model_error_in_mse_update_average_mse: [0.34056513 0.36416986 0.27259579 0.36067369 0.42810026 0.38521392]
RandomForestRegressor : model_error_in_mse_update_average_mse: 0.3586
-------------------------------------------------------------------------
BaggingRegressor : model_error_in_mse_update_average_mse: [0.34199872 0.36341315 0.27455619 0.36497126 0.43150702 0.39273627]
BaggingRegressor : model_error_in_mse_update_average_mse: 0.3615
-------------------------------------------------------------------------
AdaBoostRegressor : model_error_in_mse_update_average_mse: [0.38163007 0.3860798  0.30642461 0.39481632 0.4517351  0.43333232]
AdaBoostRegressor : model_error_in_mse_update_average_mse: